diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c index 8f1242826c9f..b768aac55ef8 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c @@ -1,808 +1,808 @@ /*- * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_kern_tls.h" #include "en.h" #include #include #include #include #ifdef KERN_TLS MALLOC_DEFINE(M_MLX5E_TLS, "MLX5E_TLS", "MLX5 ethernet HW TLS"); /* software TLS context */ struct mlx5_ifc_sw_tls_cntx_bits { struct mlx5_ifc_tls_static_params_bits param; struct mlx5_ifc_tls_progress_params_bits progress; struct { uint8_t key_data[8][0x20]; uint8_t key_len[0x20]; } key; }; CTASSERT(MLX5_ST_SZ_BYTES(sw_tls_cntx) <= sizeof(((struct mlx5e_tls_tag *)0)->crypto_params)); CTASSERT(MLX5_ST_SZ_BYTES(mkc) == sizeof(((struct mlx5e_tx_umr_wqe *)0)->mkc)); static const char *mlx5e_tls_stats_desc[] = { MLX5E_TLS_STATS(MLX5E_STATS_DESC) }; static void mlx5e_tls_work(struct work_struct *); static int mlx5e_tls_tag_zinit(void *mem, int size, int flags) { struct mlx5e_tls_tag *ptag = mem; MPASS(size == sizeof(*ptag)); memset(ptag, 0, sizeof(*ptag)); mtx_init(&ptag->mtx, "mlx5-tls-tag-mtx", NULL, MTX_DEF); INIT_WORK(&ptag->work, mlx5e_tls_work); return (0); } static void mlx5e_tls_tag_zfini(void *mem, int size) { struct mlx5e_tls_tag *ptag = mem; struct mlx5e_priv *priv; struct mlx5e_tls *ptls; ptls = ptag->tls; priv = container_of(ptls, struct mlx5e_priv, tls); flush_work(&ptag->work); if (ptag->tisn != 0) { mlx5_tls_close_tis(priv->mdev, ptag->tisn); atomic_add_32(&ptls->num_resources, -1U); } mtx_destroy(&ptag->mtx); } static void mlx5e_tls_tag_zfree(struct mlx5e_tls_tag *ptag) { /* reset some variables */ ptag->state = MLX5E_TLS_ST_INIT; ptag->dek_index = 0; ptag->dek_index_ok = 0; /* avoid leaking keys */ memset(ptag->crypto_params, 0, sizeof(ptag->crypto_params)); /* update number of TIS contexts */ if (ptag->tisn == 0) atomic_add_32(&ptag->tls->num_resources, -1U); /* return tag to UMA */ uma_zfree(ptag->tls->zone, ptag); } int mlx5e_tls_init(struct mlx5e_priv *priv) { struct mlx5e_tls *ptls = &priv->tls; struct sysctl_oid *node; uint32_t x; if (MLX5_CAP_GEN(priv->mdev, tls_tx) == 0) return (0); ptls->wq = create_singlethread_workqueue("mlx5-tls-wq"); if (ptls->wq == NULL) return (ENOMEM); sysctl_ctx_init(&ptls->ctx); snprintf(ptls->zname, sizeof(ptls->zname), "mlx5_%u_tls", device_get_unit(priv->mdev->pdev->dev.bsddev)); ptls->zone = uma_zcreate(ptls->zname, sizeof(struct mlx5e_tls_tag), NULL, NULL, mlx5e_tls_tag_zinit, mlx5e_tls_tag_zfini, UMA_ALIGN_CACHE, 0); ptls->max_resources = 1U << MLX5_CAP_GEN(priv->mdev, log_max_dek); for (x = 0; x != MLX5E_TLS_STATS_NUM; x++) ptls->stats.arg[x] = counter_u64_alloc(M_WAITOK); ptls->init = 1; node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tls", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hardware TLS offload"); if (node == NULL) return (0); mlx5e_create_counter_stats(&ptls->ctx, SYSCTL_CHILDREN(node), "stats", mlx5e_tls_stats_desc, MLX5E_TLS_STATS_NUM, ptls->stats.arg); return (0); } void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { struct mlx5e_tls *ptls = &priv->tls; uint32_t x; if (MLX5_CAP_GEN(priv->mdev, tls_tx) == 0) return; ptls->init = 0; flush_workqueue(ptls->wq); sysctl_ctx_free(&ptls->ctx); uma_zdestroy(ptls->zone); destroy_workqueue(ptls->wq); /* check if all resources are freed */ MPASS(priv->tls.num_resources == 0); for (x = 0; x != MLX5E_TLS_STATS_NUM; x++) counter_u64_free(ptls->stats.arg[x]); } static void mlx5e_tls_work(struct work_struct *work) { struct mlx5e_tls_tag *ptag; struct mlx5e_priv *priv; int err; ptag = container_of(work, struct mlx5e_tls_tag, work); priv = container_of(ptag->tls, struct mlx5e_priv, tls); switch (ptag->state) { case MLX5E_TLS_ST_INIT: /* try to open TIS, if not present */ if (ptag->tisn == 0) { err = mlx5_tls_open_tis(priv->mdev, 0, priv->tdn, priv->pdn, &ptag->tisn); if (err) { MLX5E_TLS_STAT_INC(ptag, tx_error, 1); break; } } MLX5_SET(sw_tls_cntx, ptag->crypto_params, progress.pd, ptag->tisn); /* try to allocate a DEK context ID */ err = mlx5_encryption_key_create(priv->mdev, priv->pdn, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, key.key_data), MLX5_GET(sw_tls_cntx, ptag->crypto_params, key.key_len), &ptag->dek_index); if (err) { MLX5E_TLS_STAT_INC(ptag, tx_error, 1); break; } MLX5_SET(sw_tls_cntx, ptag->crypto_params, param.dek_index, ptag->dek_index); ptag->dek_index_ok = 1; MLX5E_TLS_TAG_LOCK(ptag); if (ptag->state == MLX5E_TLS_ST_INIT) ptag->state = MLX5E_TLS_ST_SETUP; MLX5E_TLS_TAG_UNLOCK(ptag); break; case MLX5E_TLS_ST_FREED: /* wait for all refs to go away */ while (ptag->refs != 0) msleep(1); /* try to destroy DEK context by ID */ if (ptag->dek_index_ok) err = mlx5_encryption_key_destroy(priv->mdev, ptag->dek_index); /* free tag */ mlx5e_tls_tag_zfree(ptag); break; default: break; } } static int mlx5e_tls_set_params(void *ctx, const struct tls_session_params *en) { MLX5_SET(sw_tls_cntx, ctx, param.const_2, 2); if (en->tls_vminor == TLS_MINOR_VER_TWO) MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 2); /* v1.2 */ else MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 3); /* v1.3 */ MLX5_SET(sw_tls_cntx, ctx, param.const_1, 1); MLX5_SET(sw_tls_cntx, ctx, param.encryption_standard, 1); /* TLS */ /* copy the initial vector in place */ switch (en->iv_len) { case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv): case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv) + MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv): memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.gcm_iv), en->iv, en->iv_len); break; default: return (EINVAL); } if (en->cipher_key_len <= MLX5_FLD_SZ_BYTES(sw_tls_cntx, key.key_data)) { memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, key.key_data), en->cipher_key, en->cipher_key_len); MLX5_SET(sw_tls_cntx, ctx, key.key_len, en->cipher_key_len); } else { return (EINVAL); } return (0); } /* Verify zero default */ CTASSERT(MLX5E_TLS_ST_INIT == 0); int mlx5e_tls_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { union if_snd_tag_alloc_params rl_params; struct mlx5e_priv *priv; struct mlx5e_tls_tag *ptag; const struct tls_session_params *en; int error; priv = ifp->if_softc; if (priv->tls.init == 0) return (EOPNOTSUPP); /* allocate new tag from zone, if any */ ptag = uma_zalloc(priv->tls.zone, M_NOWAIT); if (ptag == NULL) return (ENOMEM); /* sanity check default values */ MPASS(ptag->state == MLX5E_TLS_ST_INIT); MPASS(ptag->dek_index == 0); MPASS(ptag->dek_index_ok == 0); /* setup TLS tag */ ptag->tls = &priv->tls; /* check if there is no TIS context */ if (ptag->tisn == 0) { uint32_t value; value = atomic_fetchadd_32(&priv->tls.num_resources, 1U); /* check resource limits */ if (value >= priv->tls.max_resources) { error = ENOMEM; goto failure; } } en = ¶ms->tls.tls->params; /* only TLS v1.2 and v1.3 is currently supported */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE || (en->tls_vminor != TLS_MINOR_VER_TWO #ifdef TLS_MINOR_VER_THREE && en->tls_vminor != TLS_MINOR_VER_THREE #endif )) { error = EPROTONOSUPPORT; goto failure; } switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: switch (en->cipher_key_len) { case 128 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_set_params(ptag->crypto_params, en); if (error) goto failure; break; case 256 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_set_params(ptag->crypto_params, en); if (error) goto failure; break; default: error = EINVAL; goto failure; } break; default: error = EPROTONOSUPPORT; goto failure; } memset(&rl_params, 0, sizeof(rl_params)); rl_params.hdr = params->hdr; switch (params->hdr.type) { #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: rl_params.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT; rl_params.max_rate = params->tls_rate_limit.max_rate; break; #endif case IF_SND_TAG_TYPE_TLS: rl_params.hdr.type = IF_SND_TAG_TYPE_UNLIMITED; if (error) goto failure; break; default: error = EOPNOTSUPP; goto failure; } - error = ifp->if_snd_tag_alloc(ifp, &rl_params, &ptag->rl_tag); + error = m_snd_tag_alloc(ifp, &rl_params, &ptag->rl_tag); if (error) goto failure; /* store pointer to mbuf tag */ MPASS(ptag->tag.refcount == 0); m_snd_tag_init(&ptag->tag, ifp, params->hdr.type); *ppmt = &ptag->tag; queue_work(priv->tls.wq, &ptag->work); flush_work(&ptag->work); return (0); failure: mlx5e_tls_tag_zfree(ptag); return (error); } int mlx5e_tls_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) union if_snd_tag_modify_params rl_params; struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); int error; #endif switch (pmt->type) { #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: memset(&rl_params, 0, sizeof(rl_params)); rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate; error = ptag->rl_tag->ifp->if_snd_tag_modify(ptag->rl_tag, &rl_params); return (error); #endif default: return (EOPNOTSUPP); } } int mlx5e_tls_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); int error; switch (pmt->type) { #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: #endif case IF_SND_TAG_TYPE_TLS: error = ptag->rl_tag->ifp->if_snd_tag_query(ptag->rl_tag, params); break; default: error = EOPNOTSUPP; break; } return (error); } void mlx5e_tls_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); struct mlx5e_priv *priv; MPASS(ptag->rl_tag->refcount == 1); m_snd_tag_rele(ptag->rl_tag); MLX5E_TLS_TAG_LOCK(ptag); ptag->state = MLX5E_TLS_ST_FREED; MLX5E_TLS_TAG_UNLOCK(ptag); priv = ptag->tag.ifp->if_softc; queue_work(priv->tls.wq, &ptag->work); } CTASSERT((MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) % 16) == 0); static void mlx5e_tls_send_static_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_umr_wqe) + MLX5_FLD_SZ_BYTES(sw_tls_cntx, param), MLX5_SEND_WQE_DS); struct mlx5e_tx_umr_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_UMR | (MLX5_OPCODE_MOD_UMR_TLS_TIS_STATIC_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(ptag->tisn << 8); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; else wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; /* fill out UMR control segment */ wqe->umr.flags = 0x80; /* inline data */ wqe->umr.bsf_octowords = cpu_to_be16(MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) / 16); /* copy in the static crypto parameters */ memcpy(wqe + 1, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, param), MLX5_FLD_SZ_BYTES(sw_tls_cntx, param)); /* copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].p_refcount = &ptag->refs; atomic_add_int(&ptag->refs, 1); sq->pc += sq->mbuf[pi].num_wqebbs; } CTASSERT(MLX5_FLD_SZ_BYTES(sw_tls_cntx, progress) == sizeof(((struct mlx5e_tx_psv_wqe *)0)->psv)); static void mlx5e_tls_send_progress_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_psv_wqe), MLX5_SEND_WQE_DS); struct mlx5e_tx_psv_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIS_PROGRESS_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; /* copy in the PSV control segment */ memcpy(&wqe->psv, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, progress), sizeof(wqe->psv)); /* copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].p_refcount = &ptag->refs; atomic_add_int(&ptag->refs, 1); sq->pc += sq->mbuf[pi].num_wqebbs; } static void mlx5e_tls_send_nop(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = MLX5_SEND_WQEBB_NUM_DS; struct mlx5e_tx_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; else wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].p_refcount = &ptag->refs; atomic_add_int(&ptag->refs, 1); sq->pc += sq->mbuf[pi].num_wqebbs; } #define SBTLS_MBUF_NO_DATA ((struct mbuf *)1) static struct mbuf * sbtls_recover_record(struct mbuf *mb, int wait, uint32_t tcp_old, uint32_t *ptcp_seq, bool *pis_start) { struct mbuf *mr, *top; uint32_t offset; uint32_t delta; /* check format of incoming mbuf */ if (mb->m_next == NULL || (mb->m_next->m_flags & (M_EXTPG | M_EXT)) != (M_EXTPG | M_EXT)) { top = NULL; goto done; } /* get unmapped data offset */ offset = mtod(mb->m_next, uintptr_t); /* check if we don't need to re-transmit anything */ if (offset == 0) { top = SBTLS_MBUF_NO_DATA; *pis_start = true; goto done; } /* try to get a new packet header */ top = m_gethdr(wait, MT_DATA); if (top == NULL) goto done; mr = m_get(wait, MT_DATA); if (mr == NULL) { m_free(top); top = NULL; goto done; } top->m_next = mr; mb_dupcl(mr, mb->m_next); /* the beginning of the TLS record */ mr->m_data = NULL; /* setup packet header length */ top->m_pkthdr.len = mr->m_len = offset; top->m_len = 0; /* check for partial re-transmit */ delta = *ptcp_seq - tcp_old; if (delta < offset) { m_adj(top, offset - delta); offset = delta; /* continue where we left off */ *pis_start = false; } else { *pis_start = true; } /* * Rewind the TCP sequence number by the amount of data * retransmitted: */ *ptcp_seq -= offset; done: return (top); } static int mlx5e_sq_tls_populate(struct mbuf *mb, uint64_t *pseq) { for (; mb != NULL; mb = mb->m_next) { if (!(mb->m_flags & M_EXTPG)) continue; *pseq = mb->m_epg_seqno; return (1); } return (0); } int mlx5e_sq_tls_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **ppmb) { struct mlx5e_tls_tag *ptls_tag; struct m_snd_tag *ptag; const struct tcphdr *th; struct mbuf *mb = *ppmb; u64 rcd_sn; u32 header_size; u32 mb_seq; if ((mb->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0) return (MLX5E_TLS_CONTINUE); ptag = mb->m_pkthdr.snd_tag; if ( #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) ptag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT && #endif ptag->type != IF_SND_TAG_TYPE_TLS) return (MLX5E_TLS_CONTINUE); ptls_tag = container_of(ptag, struct mlx5e_tls_tag, tag); header_size = mlx5e_get_full_header_size(mb, &th); if (unlikely(header_size == 0 || th == NULL)) return (MLX5E_TLS_FAILURE); /* * Send non-TLS TCP packets AS-IS: */ if (header_size == mb->m_pkthdr.len || mlx5e_sq_tls_populate(mb, &rcd_sn) == 0) { parg->tisn = 0; parg->ihs = header_size; return (MLX5E_TLS_CONTINUE); } mb_seq = ntohl(th->th_seq); MLX5E_TLS_TAG_LOCK(ptls_tag); switch (ptls_tag->state) { case MLX5E_TLS_ST_INIT: MLX5E_TLS_TAG_UNLOCK(ptls_tag); return (MLX5E_TLS_FAILURE); case MLX5E_TLS_ST_SETUP: ptls_tag->state = MLX5E_TLS_ST_TXRDY; ptls_tag->expected_seq = ~mb_seq; /* force setup */ default: MLX5E_TLS_TAG_UNLOCK(ptls_tag); break; } if (unlikely(ptls_tag->expected_seq != mb_seq)) { bool is_start; struct mbuf *r_mb; uint32_t tcp_seq = mb_seq; r_mb = sbtls_recover_record(mb, M_NOWAIT, ptls_tag->expected_seq, &tcp_seq, &is_start); if (r_mb == NULL) { MLX5E_TLS_STAT_INC(ptls_tag, tx_error, 1); return (MLX5E_TLS_FAILURE); } MLX5E_TLS_STAT_INC(ptls_tag, tx_packets_ooo, 1); /* check if this is the first fragment of a TLS record */ if (is_start) { /* setup TLS static parameters */ MLX5_SET64(sw_tls_cntx, ptls_tag->crypto_params, param.initial_record_number, rcd_sn); /* * NOTE: The sendqueue should have enough room to * carry both the static and the progress parameters * when we get here! */ mlx5e_tls_send_static_parameters(sq, ptls_tag); mlx5e_tls_send_progress_parameters(sq, ptls_tag); if (r_mb == SBTLS_MBUF_NO_DATA) { mlx5e_tls_send_nop(sq, ptls_tag); ptls_tag->expected_seq = mb_seq; return (MLX5E_TLS_LOOP); } } MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes_ooo, r_mb->m_pkthdr.len); /* setup transmit arguments */ parg->tisn = ptls_tag->tisn; parg->pref = &ptls_tag->refs; /* try to send DUMP data */ if (mlx5e_sq_dump_xmit(sq, parg, &r_mb) != 0) { m_freem(r_mb); ptls_tag->expected_seq = tcp_seq; return (MLX5E_TLS_FAILURE); } else { ptls_tag->expected_seq = mb_seq; return (MLX5E_TLS_LOOP); } } else { MLX5E_TLS_STAT_INC(ptls_tag, tx_packets, 1); MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes, mb->m_pkthdr.len); } ptls_tag->expected_seq += mb->m_pkthdr.len - header_size; parg->tisn = ptls_tag->tisn; parg->ihs = header_size; parg->pref = &ptls_tag->refs; return (MLX5E_TLS_CONTINUE); } #else int mlx5e_tls_init(struct mlx5e_priv *priv) { return (0); } void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { /* NOP */ } #endif /* KERN_TLS */ diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index 2c9b44309484..84e068424427 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -1,1649 +1,1659 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Primary Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Primary Zone, * thus sharing backend Slab kegs with the Mbuf Primary Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Primary Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ bool mb_use_ext_pgs; /* use M_EXTPG mbufs for sendfile & TLS */ SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, &mb_use_ext_pgs, 0, "Use unmapped mbufs for sendfile(2) and TLS offload"); static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); static counter_u64_t snd_tag_count; SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, &snd_tag_count, "# of active mbuf send tags"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); _Static_assert(sizeof(struct mbuf) <= MSIZE, "size of mbuf exceeds MSIZE"); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); #ifdef DEBUGNET /* * debugnet makes use of a pre-allocated pool of mbufs and clusters. When * debugnet is configured, we initialize a set of UMA cache zones which return * items from this pool. At panic-time, the regular UMA zone pointers are * overwritten with those of the cache zones so that drivers may allocate and * free mbufs and clusters without attempting to allocate physical memory. * * We keep mbufs and clusters in a pair of mbuf queues. In particular, for * the purpose of caching clusters, we treat them as mbufs. */ static struct mbufq dn_mbufq = { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; static struct mbufq dn_clustq = { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; static int dn_clsize; static uma_zone_t dn_zone_mbuf; static uma_zone_t dn_zone_clust; static uma_zone_t dn_zone_pack; static struct debugnet_saved_zones { uma_zone_t dsz_mbuf; uma_zone_t dsz_clust; uma_zone_t dsz_pack; uma_zone_t dsz_jumbop; uma_zone_t dsz_jumbo9; uma_zone_t dsz_jumbo16; bool dsz_debugnet_zones_enabled; } dn_saved_zones; static int dn_buf_import(void *arg, void **store, int count, int domain __unused, int flags) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = mbufq_dequeue(q); if (m == NULL) break; trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_buf_release(void *arg, void **store, int count) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = store[i]; (void)mbufq_enqueue(q, m); } } static int dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, int flags __unused) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = m_get(MT_DATA, M_NOWAIT); if (m == NULL) break; clust = uma_zalloc(dn_zone_clust, M_NOWAIT); if (clust == NULL) { m_free(m); break; } mb_ctor_clust(clust, dn_clsize, m, 0); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_pack_release(void *arg __unused, void **store, int count) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = store[i]; clust = m->m_ext.ext_buf; uma_zfree(dn_zone_clust, clust); uma_zfree(dn_zone_mbuf, m); } } /* * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy * the corresponding UMA cache zones. */ void debugnet_mbuf_drain(void) { struct mbuf *m; void *item; if (dn_zone_mbuf != NULL) { uma_zdestroy(dn_zone_mbuf); dn_zone_mbuf = NULL; } if (dn_zone_clust != NULL) { uma_zdestroy(dn_zone_clust); dn_zone_clust = NULL; } if (dn_zone_pack != NULL) { uma_zdestroy(dn_zone_pack); dn_zone_pack = NULL; } while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) m_free(m); while ((item = mbufq_dequeue(&dn_clustq)) != NULL) uma_zfree(m_getzone(dn_clsize), item); } /* * Callback invoked immediately prior to starting a debugnet connection. */ void debugnet_mbuf_start(void) { MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); /* Save the old zone pointers to restore when debugnet is closed. */ dn_saved_zones = (struct debugnet_saved_zones) { .dsz_debugnet_zones_enabled = true, .dsz_mbuf = zone_mbuf, .dsz_clust = zone_clust, .dsz_pack = zone_pack, .dsz_jumbop = zone_jumbop, .dsz_jumbo9 = zone_jumbo9, .dsz_jumbo16 = zone_jumbo16, }; /* * All cluster zones return buffers of the size requested by the * drivers. It's up to the driver to reinitialize the zones if the * MTU of a debugnet-enabled interface changes. */ printf("debugnet: overwriting mbuf zone pointers\n"); zone_mbuf = dn_zone_mbuf; zone_clust = dn_zone_clust; zone_pack = dn_zone_pack; zone_jumbop = dn_zone_clust; zone_jumbo9 = dn_zone_clust; zone_jumbo16 = dn_zone_clust; } /* * Callback invoked when a debugnet connection is closed/finished. */ void debugnet_mbuf_finish(void) { MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); printf("debugnet: restoring mbuf zone pointers\n"); zone_mbuf = dn_saved_zones.dsz_mbuf; zone_clust = dn_saved_zones.dsz_clust; zone_pack = dn_saved_zones.dsz_pack; zone_jumbop = dn_saved_zones.dsz_jumbop; zone_jumbo9 = dn_saved_zones.dsz_jumbo9; zone_jumbo16 = dn_saved_zones.dsz_jumbo16; memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); } /* * Reinitialize the debugnet mbuf+cluster pool and cache zones. */ void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) { struct mbuf *m; void *item; debugnet_mbuf_drain(); dn_clsize = clsize; dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, dn_buf_import, dn_buf_release, &dn_mbufq, UMA_ZONE_NOBUCKET); dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, clsize, mb_ctor_clust, NULL, NULL, NULL, dn_buf_import, dn_buf_release, &dn_clustq, UMA_ZONE_NOBUCKET); dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, dn_pack_import, dn_pack_release, NULL, UMA_ZONE_NOBUCKET); while (nmbuf-- > 0) { m = m_get(MT_DATA, M_WAITOK); uma_zfree(dn_zone_mbuf, m); } while (nclust-- > 0) { item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); uma_zfree(dn_zone_clust, item); } } #endif /* DEBUGNET */ /* * Constructor for Mbuf primary zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf primary zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, cause them to be woken up by draining the * packet zone. We are exposed to a race here (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted(zone_clust)) uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is closed to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct epoch_tracker et; struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); NET_EPOCH_ENTER(et); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); NET_EPOCH_EXIT(et); } /* * Free "count" units of I/O from an mbuf chain. They could be held * in M_EXTPG or just as a normal mbuf. This code is intended to be * called in an error path (I/O error, closed connection, etc). */ void mb_free_notready(struct mbuf *m, int count) { int i; for (i = 0; i < count && m != NULL; i++) { if ((m->m_flags & M_EXTPG) != 0) { m->m_epg_nrdy--; if (m->m_epg_nrdy != 0) continue; } m = m_free(m); } KASSERT(i == count, ("Removed only %d items from %p", i, m)); } /* * Compress an unmapped mbuf into a simple mbuf when it holds a small * amount of data. This is used as a DOS defense to avoid having * small packets tie up wired pages, an ext_pgs structure, and an * mbuf. Since this converts the existing mbuf in place, it can only * be used if there are no other references to 'm'. */ int mb_unmapped_compress(struct mbuf *m) { volatile u_int *refcnt; char buf[MLEN]; /* * Assert that 'm' does not have a packet header. If 'm' had * a packet header, it would only be able to hold MHLEN bytes * and m_data would have to be initialized differently. */ KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt != 1) return (EBUSY); m_copydata(m, 0, m->m_len, buf); /* Free the backing pages. */ m->m_ext.ext_free(m); /* Turn 'm' into a "normal" mbuf. */ m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); m->m_data = m->m_dat; /* Copy data back into m. */ bcopy(buf, mtod(m, char *), m->m_len); return (0); } /* * These next few routines are used to permit downgrading an unmapped * mbuf to a chain of mapped mbufs. This is used when an interface * doesn't supported unmapped mbufs or if checksums need to be * computed in software. * * Each unmapped mbuf is converted to a chain of mbufs. First, any * TLS header data is stored in a regular mbuf. Second, each page of * unmapped data is stored in an mbuf with an EXT_SFBUF external * cluster. These mbufs use an sf_buf to provide a valid KVA for the * associated physical page. They also hold a reference on the * original M_EXTPG mbuf to ensure the physical page doesn't go away. * Finally, any TLS trailer data is stored in a regular mbuf. * * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF * mbufs. It frees the associated sf_buf and releases its reference * on the original M_EXTPG mbuf. * * _mb_unmapped_to_ext() is a helper function that converts a single * unmapped mbuf into a chain of mbufs. * * mb_unmapped_to_ext() is the public function that walks an mbuf * chain converting any unmapped mbufs to mapped mbufs. It returns * the new chain of unmapped mbufs on success. On failure it frees * the original mbuf chain and returns NULL. */ static void mb_unmapped_free_mext(struct mbuf *m) { struct sf_buf *sf; struct mbuf *old_m; sf = m->m_ext.ext_arg1; sf_buf_free(sf); /* Drop the reference on the backing M_EXTPG mbuf. */ old_m = m->m_ext.ext_arg2; mb_free_extpg(old_m); } static struct mbuf * _mb_unmapped_to_ext(struct mbuf *m) { struct mbuf *m_new, *top, *prev, *mref; struct sf_buf *sf; vm_page_t pg; int i, len, off, pglen, pgoff, seglen, segoff; volatile u_int *refcnt; u_int ref_inc = 0; M_ASSERTEXTPG(m); len = m->m_len; KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* Skip over any data removed from the front. */ off = mtod(m, vm_offset_t); top = NULL; if (m->m_epg_hdrlen != 0) { if (off >= m->m_epg_hdrlen) { off -= m->m_epg_hdrlen; } else { seglen = m->m_epg_hdrlen - off; segoff = off; seglen = min(seglen, len); off = 0; len -= seglen; m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; m_new->m_len = seglen; prev = top = m_new; memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], seglen); } } pgoff = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off >= pglen) { off -= pglen; pgoff = 0; continue; } seglen = pglen - off; segoff = pgoff + off; off = 0; seglen = min(seglen, len); len -= seglen; pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) { top = prev = m_new; } else { prev->m_next = m_new; prev = m_new; } sf = sf_buf_alloc(pg, SFB_NOWAIT); if (sf == NULL) goto fail; ref_inc++; m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); m_new->m_data += segoff; m_new->m_len = seglen; pgoff = 0; }; if (len != 0) { KASSERT((off + len) <= m->m_epg_trllen, ("off + len > trail (%d + %d > %d)", off, len, m->m_epg_trllen)); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) top = m_new; else prev->m_next = m_new; m_new->m_len = len; memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); } if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be dropped * in mb_unmapped_free_mext(). */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); return (top); fail: if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be * immediately dropped when these mbufs are freed * below. */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); m_freem(top); return (NULL); } struct mbuf * mb_unmapped_to_ext(struct mbuf *top) { struct mbuf *m, *next, *prev = NULL; prev = NULL; for (m = top; m != NULL; m = next) { /* m might be freed, so cache the next pointer. */ next = m->m_next; if (m->m_flags & M_EXTPG) { if (prev != NULL) { /* * Remove 'm' from the new chain so * that the 'top' chain terminates * before 'm' in case 'top' is freed * due to an error. */ prev->m_next = NULL; } m = _mb_unmapped_to_ext(m); if (m == NULL) { m_freem(top); m_freem(next); return (NULL); } if (prev == NULL) { top = m; } else { prev->m_next = m; } /* * Replaced one mbuf with a chain, so we must * find the end of chain. */ prev = m_last(m); } else { if (prev != NULL) { prev->m_next = m; } prev = m; } } return (top); } /* * Allocate an empty M_EXTPG mbuf. The ext_free routine is * responsible for freeing any pages backing this mbuf when it is * freed. */ struct mbuf * mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) { struct mbuf *m; m = m_get(how, MT_DATA); if (m == NULL) return (NULL); m->m_epg_npgs = 0; m->m_epg_nrdy = 0; m->m_epg_1st_off = 0; m->m_epg_last_len = 0; m->m_epg_flags = 0; m->m_epg_hdrlen = 0; m->m_epg_trllen = 0; m->m_epg_tls = NULL; m->m_epg_so = NULL; m->m_data = NULL; m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_ext.ext_size = 0; m->m_ext.ext_free = ext_free; return (m); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF || m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); uma_zfree(zone_mbuf, mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; case EXT_RXRING: KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free is set", __func__)); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) uma_zfree(zone_mbuf, m); } /* * Clean up after mbufs with M_EXTPG storage attached to them if the * reference count hits 1. */ void mb_free_extpg(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; M_ASSERTEXTPG(m); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); #ifdef KERN_TLS if (mref->m_epg_tls != NULL && !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) ktls_enqueue_to_free(mref); else #endif uma_zfree(zone_mbuf, mref); } if (m != mref) uma_zfree(zone_mbuf, m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } MBUF_PROBE5(m__getjcl, how, type, flags, size, m); return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { mb = NULL; if (len > MCLBYTES) { mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), MJUMPAGESIZE); } if (mb == NULL) { if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* * Fail the whole operation if one mbuf can't be * allocated. */ if (mb == NULL) { m_freem(nm); return (NULL); } } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. */ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } +int +m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, + struct m_snd_tag **mstp) +{ + + if (ifp->if_snd_tag_alloc == NULL) + return (EOPNOTSUPP); + return (ifp->if_snd_tag_alloc(ifp, params, mstp)); +} + void m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, u_int type) { if_ref(ifp); mst->ifp = ifp; refcount_init(&mst->refcount, 1); mst->type = type; counter_u64_add(snd_tag_count, 1); } void m_snd_tag_destroy(struct m_snd_tag *mst) { struct ifnet *ifp; ifp = mst->ifp; ifp->if_snd_tag_free(mst); if_rele(ifp); counter_u64_add(snd_tag_count, -1); } /* * Allocate an mbuf with anonymous external pages. */ struct mbuf * mb_alloc_ext_plus_pages(int len, int how) { struct mbuf *m; vm_page_t pg; int i, npgs; m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (m == NULL) return (NULL); m->m_epg_flags |= EPG_FLAG_ANON; npgs = howmany(len, PAGE_SIZE); for (i = 0; i < npgs; i++) { do { pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { if (how == M_NOWAIT) { m->m_epg_npgs = i; m_free(m); return (NULL); } vm_wait(NULL); } } while (pg == NULL); m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); } m->m_epg_npgs = npgs; return (m); } /* * Copy the data in the mbuf chain to a chain of mbufs with anonymous external * unmapped pages. * len is the length of data in the input mbuf chain. * mlen is the maximum number of bytes put into each ext_page mbuf. */ struct mbuf * mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, struct mbuf **mlast) { struct mbuf *m, *mout; char *pgpos, *mbpos; int i, mblen, mbufsiz, pglen, xfer; if (len == 0) return (NULL); mbufsiz = min(mlen, len); m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); if (m == NULL) return (m); pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); pglen = PAGE_SIZE; mblen = 0; i = 0; do { if (pglen == 0) { if (++i == m->m_epg_npgs) { m->m_epg_last_len = PAGE_SIZE; mbufsiz = min(mlen, len); m->m_next = mb_alloc_ext_plus_pages(mbufsiz, how); m = m->m_next; if (m == NULL) { m_freem(mout); return (m); } i = 0; } pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); pglen = PAGE_SIZE; } while (mblen == 0) { if (mp == NULL) { m_freem(mout); return (NULL); } KASSERT((mp->m_flags & M_EXTPG) == 0, ("mb_copym_ext_pgs: ext_pgs input mbuf")); mbpos = mtod(mp, char *); mblen = mp->m_len; mp = mp->m_next; } xfer = min(mblen, pglen); memcpy(pgpos, mbpos, xfer); pgpos += xfer; mbpos += xfer; pglen -= xfer; mblen -= xfer; len -= xfer; m->m_len += xfer; } while (len > 0); m->m_epg_last_len = PAGE_SIZE - pglen; if (mlast != NULL) *mlast = m; return (mout); } diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index c048f708ecfe..2776d6fcd193 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1,2133 +1,2129 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #include #include #ifdef RSS #include #include #endif #include #include #if defined(INET) || defined(INET6) #include #include #endif #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf) m_head; STAILQ_HEAD(, socket) so_head; bool running; } __aligned(CACHE_LINE_SIZE); static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; LIST_HEAD(, ktls_crypto_backend) ktls_backends; static struct rmlock ktls_backends_lock; static uma_zone_t ktls_session_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload stats"); static int ktls_allow_unload; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN, &ktls_allow_unload, 0, "Allow software crypto modules to unload"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores or domains at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static counter_u64_t ktls_tasks_active; SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static counter_u64_t ktls_cnt_tx_queued; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, &ktls_cnt_tx_queued, "Number of TLS records in queue to tasks for SW encryption"); static counter_u64_t ktls_cnt_rx_queued; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, &ktls_cnt_rx_queued, "Number of TLS sockets in queue to tasks for SW decryption"); static counter_u64_t ktls_offload_total; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static counter_u64_t ktls_offload_enable_calls; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static counter_u64_t ktls_offload_active; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static counter_u64_t ktls_offload_corrupted_records; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, &ktls_offload_corrupted_records, "Total corrupted TLS records received"); static counter_u64_t ktls_offload_failed_crypto; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static counter_u64_t ktls_switch_to_ifnet; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static counter_u64_t ktls_switch_to_sw; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static counter_u64_t ktls_switch_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Hardware (ifnet) TLS session stats"); #ifdef TCP_OFFLOAD SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TOE TLS session stats"); #endif static counter_u64_t ktls_sw_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static counter_u64_t ktls_sw_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static counter_u64_t ktls_ifnet_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_reset; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static counter_u64_t ktls_ifnet_reset_dropped; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static counter_u64_t ktls_ifnet_reset_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); #ifdef TCP_OFFLOAD static counter_u64_t ktls_toe_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD, &ktls_toe_cbc, "Active number of TOE TLS sessions using AES-CBC"); static counter_u64_t ktls_toe_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD, &ktls_toe_gcm, "Active number of TOE TLS sessions using AES-GCM"); #endif static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); int ktls_crypto_backend_register(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *curr_be, *tmp; if (be->api_version != KTLS_API_VERSION) { printf("KTLS: API version mismatch (%d vs %d) for %s\n", be->api_version, KTLS_API_VERSION, be->name); return (EINVAL); } rm_wlock(&ktls_backends_lock); printf("KTLS: Registering crypto method %s with prio %d\n", be->name, be->prio); if (LIST_EMPTY(&ktls_backends)) { LIST_INSERT_HEAD(&ktls_backends, be, next); } else { LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) { if (curr_be->prio < be->prio) { LIST_INSERT_BEFORE(curr_be, be, next); break; } if (LIST_NEXT(curr_be, next) == NULL) { LIST_INSERT_AFTER(curr_be, be, next); break; } } } rm_wunlock(&ktls_backends_lock); return (0); } int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *tmp; /* * Don't error if the backend isn't registered. This permits * MOD_UNLOAD handlers to use this function unconditionally. */ rm_wlock(&ktls_backends_lock); LIST_FOREACH(tmp, &ktls_backends, next) { if (tmp == be) break; } if (tmp == NULL) { rm_wunlock(&ktls_backends_lock); return (0); } if (!ktls_allow_unload) { rm_wunlock(&ktls_backends_lock); printf( "KTLS: Deregistering crypto method %s is not supported\n", be->name); return (EBUSY); } if (be->use_count) { rm_wunlock(&ktls_backends_lock); return (EBUSY); } LIST_REMOVE(be, next); rm_wunlock(&ktls_backends_lock); return (0); } #if defined(INET) || defined(INET6) static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; u_int cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that some crypto backends rely on the * serialization provided by having the same connection use * the same queue. */ cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static void ktls_init(void *dummy __unused) { struct thread *td; struct pcpu *pc; cpuset_t mask; int error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK); ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK); ktls_offload_total = counter_u64_alloc(M_WAITOK); ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK); ktls_offload_active = counter_u64_alloc(M_WAITOK); ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK); ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK); ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK); ktls_switch_to_sw = counter_u64_alloc(M_WAITOK); ktls_switch_failed = counter_u64_alloc(M_WAITOK); ktls_sw_cbc = counter_u64_alloc(M_WAITOK); ktls_sw_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK); ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK); #ifdef TCP_OFFLOAD ktls_toe_cbc = counter_u64_alloc(M_WAITOK); ktls_toe_gcm = counter_u64_alloc(M_WAITOK); #endif rm_init(&ktls_backends_lock, "ktls backends"); LIST_INIT(&ktls_backends); ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].m_head); STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) panic("Can't add KTLS thread %d error %d", i, error); /* * Bind threads to cores. If ktls_bind_threads is > * 1, then we bind to the NUMA domain. */ if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); CPU_COPY(&cpuset_domain[pc->pc_domain], &mask); } else { CPU_SETOF(i, &mask); } error = cpuset_setthread(td->td_tid, &mask); if (error) panic( "Unable to bind KTLS thread for CPU %d error %d", i, error); } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.3 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv)) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: #ifdef COMPAT_FREEBSD12 /* XXX: Really 13.0-current COMPAT. */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: #endif break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); if ((en->tls_vminor == TLS_MINOR_VER_TWO && en->iv_len != TLS_AEAD_GCM_LEN) || (en->tls_vminor == TLS_MINOR_VER_THREE && en->iv_len != TLS_1_3_GCM_IV_LEN)) return (EINVAL); break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: /* * TLS 1.0 requires an implicit IV. TLS 1.1+ * all use explicit IVs. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) { if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; } /* FALLTHROUGH */ case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); break; default: return (EINVAL); } tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte * nonce. TLS 1.3 uses a 12 byte implicit IV. */ if (en->tls_vminor < TLS_MINOR_VER_THREE) tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; /* * TLS 1.3 includes optional padding which we * do not support, and also puts the "real" record * type at the end of the encrypted data. */ if (en->tls_vminor == TLS_MINOR_VER_THREE) tls->params.tls_tlen += sizeof(uint8_t); tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. */ } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; default: panic("invalid cipher"); } KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for GCM and * the initial implicit IV for TLS 1.0. The explicit portions * of the IV are generated in ktls_frame(). */ if (en->iv_len != 0) { tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; /* * For TLS 1.2, generate an 8-byte nonce as a counter * to generate unique explicit IVs. * * Store this counter in the last 8 bytes of the IV * array so that it is 8-byte aligned. */ if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && en->tls_vminor == TLS_MINOR_VER_TWO) arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0); } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. */ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); switch (tls->mode) { case TCP_TLS_MODE_SW: MPASS(tls->be != NULL); switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; } tls->free(tls); break; case TCP_TLS_MODE_IFNET: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; } if (tls->snd_tag != NULL) m_snd_tag_rele(tls->snd_tag); break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, -1); break; } break; #endif } if (tls->params.auth_key != NULL) { zfree(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { zfree(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) #ifdef TCP_OFFLOAD static int ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) { struct inpcb *inp; struct tcpcb *tp; int error; inp = so->so_pcb; INP_WLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); if (!(tp->t_flags & TF_TOE)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = tcp_offload_alloc_tls_session(tp, tls, direction); INP_WUNLOCK(inp); if (error == 0) { tls->mode = TCP_TLS_MODE_TOE; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, 1); break; } } return (error); } #endif /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); /* * Allocate a TLS + ratelimit tag if the connection has an * existing pacing rate. */ if (tp->t_pacing_rate != -1 && (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) { params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT; params.tls_rate_limit.inp = inp; params.tls_rate_limit.tls = tls; params.tls_rate_limit.max_rate = tp->t_pacing_rate; } else { params.hdr.type = IF_SND_TAG_TYPE_TLS; params.tls.inp = inp; params.tls.tls = tls; } params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; INP_RUNLOCK(inp); - if (ifp->if_snd_tag_alloc == NULL) { - error = EOPNOTSUPP; - goto out; - } if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } - error = ifp->if_snd_tag_alloc(ifp, ¶ms, mstp); + error = m_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->mode = TCP_TLS_MODE_IFNET; tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; } } return (error); } static int ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction) { struct rm_priotracker prio; struct ktls_crypto_backend *be; /* * Choose the best software crypto backend. Backends are * stored in sorted priority order (larget value == most * important at the head of the list), so this just stops on * the first backend that claims the session by returning * success. */ if (ktls_allow_unload) rm_rlock(&ktls_backends_lock, &prio); LIST_FOREACH(be, &ktls_backends, next) { if (be->try(so, tls, direction) == 0) break; KASSERT(tls->cipher == NULL, ("ktls backend leaked a cipher pointer")); } if (be != NULL) { if (ktls_allow_unload) be->use_count++; tls->be = be; } if (ktls_allow_unload) rm_runlock(&ktls_backends_lock, &prio); if (be == NULL) return (EOPNOTSUPP); tls->mode = TCP_TLS_MODE_SW; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; } return (0); } /* * KTLS RX stores data in the socket buffer as a list of TLS records, * where each record is stored as a control message containg the TLS * header followed by data mbufs containing the decrypted data. This * is different from KTLS TX which always uses an mb_ext_pgs mbuf for * both encrypted and decrypted data. TLS records decrypted by a NIC * should be queued to the socket buffer as records, but encrypted * data which needs to be decrypted by software arrives as a stream of * regular mbufs which need to be converted. In addition, there may * already be pending encrypted data in the socket buffer when KTLS RX * is enabled. * * To manage not-yet-decrypted data for KTLS RX, the following scheme * is used: * * - A single chain of NOTREADY mbufs is hung off of sb_mtls. * * - ktls_check_rx checks this chain of mbufs reading the TLS header * from the first mbuf. Once all of the data for that TLS record is * queued, the socket is queued to a worker thread. * * - The worker thread calls ktls_decrypt to decrypt TLS records in * the TLS chain. Each TLS record is detached from the TLS chain, * decrypted, and inserted into the regular socket buffer chain as * record starting with a control message holding the TLS header and * a chain of mbufs holding the encrypted data. */ static void sb_mark_notready(struct sockbuf *sb) { struct mbuf *m; m = sb->sb_mb; sb->sb_mtls = m; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); m->m_flags |= M_NOTREADY; sb->sb_acc -= m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mtlstail = m; } KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, sb->sb_ccc)); } int ktls_enable_rx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_rcv.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS 1.3 is not yet supported. */ if (en->tls_vmajor == TLS_MAJOR_VER_ONE && en->tls_vminor == TLS_MINOR_VER_THREE) return (ENOTSUP); error = ktls_create_session(so, en, &tls); if (error) return (error); #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); if (error) #endif error = ktls_try_sw(so, tls, KTLS_RX); if (error) { ktls_cleanup(tls); return (error); } /* Mark the socket as using TLS offload. */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; so->so_rcv.sb_flags |= SB_TLS_RX; /* Mark existing data as not ready until it can be decrypted. */ sb_mark_notready(&so->so_rcv); ktls_check_rx(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_TX); if (error) #endif error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls, KTLS_TX); if (error) { ktls_cleanup(tls); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } /* * Write lock the INP when setting sb_tls_info so that * routines in tcp_ratelimit.c can read sb_tls_info while * holding the INP lock. */ inp = so->so_pcb; INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); sbunlock(&so->so_snd); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_rx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_rcv); tls = so->so_rcv.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else mode = tls->mode; SOCKBUF_UNLOCK(&so->so_rcv); return (mode); } int ktls_get_tx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else mode = tls->mode; SOCKBUF_UNLOCK(&so->so_snd); return (mode); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; switch (mode) { case TCP_TLS_MODE_SW: case TCP_TLS_MODE_IFNET: break; default: return (EINVAL); } inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if (tls->mode == mode) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); sbunlock(&so->so_snd); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. * * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(tp->t_vnet); tp = tcp_drop(tp, ECONNABORTED); CURVNET_RESTORE(); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; /* Can't get to the inp, but it should be locked. */ /* INP_LOCK_ASSERT(inp); */ MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag == NULL) { /* * Resetting send tag, ignore this change. The * pending reset may or may not see this updated rate * in the tcpcb. If it doesn't, we will just lose * this rate change. */ return (0); } MPASS(tls->snd_tag != NULL); MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT); mst = tls->snd_tag; ifp = mst->ifp; return (ifp->if_snd_tag_modify(mst, ¶ms)); } #endif #endif void ktls_destroy(struct ktls_session *tls) { struct rm_priotracker prio; ktls_cleanup(tls); if (tls->be != NULL && ktls_allow_unload) { rm_rlock(&ktls_backends_lock, &prio); tls->be->use_count--; rm_runlock(&ktls_backends_lock, &prio); } uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_seq: mapped mbuf %p", m)); m->m_epg_seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. To handle the * special case of empty fragments for TLS 1.0 sessions, an empty * fragment counts as one page. */ void ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; uint64_t *noncep; uint16_t tls_len; int maxlen; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be TLS records whose * payload does not exceed the maximum frame length. * * Empty TLS records are permitted when using CBC. */ KASSERT(m->m_len <= maxlen && (tls->params.cipher_algorithm == CRYPTO_AES_CBC ? m->m_len >= 0 : m->m_len > 0), ("ktls_frame: m %p len %d\n", m, m->m_len)); /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); tls_len = m->m_len; /* Save a reference to the session. */ m->m_epg_tls = ktls_hold(tls); m->m_epg_hdrlen = tls->params.tls_hlen; m->m_epg_trllen = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.sb_tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); m->m_epg_trllen -= delta; } m->m_len += m->m_epg_hdrlen + m->m_epg_trllen; /* Populate the TLS header. */ tlshdr = (void *)m->m_epg_hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; /* * TLS 1.3 masquarades as TLS 1.2 with a record type * of TLS_RLTYPE_APP. */ if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { tlshdr->tls_vminor = TLS_MINOR_VER_TWO; tlshdr->tls_type = TLS_RLTYPE_APP; /* save the real record type for later */ m->m_epg_record_type = record_type; m->m_epg_trail[0] = record_type; } else { tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * Store nonces / explicit IVs after the end of the * TLS header. * * For GCM with TLS 1.2, an 8 byte nonce is copied * from the end of the IV. The nonce is then * incremented for use by the next record. * * For CBC, a random nonce is inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && tls->params.tls_vminor == TLS_MINOR_VER_TWO) { noncep = (uint64_t *)(tls->params.iv + 8); be64enc(tlshdr + 1, *noncep); (*noncep)++; } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->mode == TCP_TLS_MODE_SW) { m->m_flags |= M_NOTREADY; m->m_epg_nrdy = m->m_epg_npgs; if (__predict_false(tls_len == 0)) { /* TLS 1.0 empty fragment. */ *enq_cnt += 1; } else *enq_cnt += m->m_epg_npgs; } } } void ktls_check_rx(struct sockbuf *sb) { struct tls_record_layer hdr; struct ktls_wq *wq; struct socket *so; bool running; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); if (sb->sb_flags & SB_TLS_RX_RUNNING) return; /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < sizeof(hdr)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) so->so_error = EMSGSIZE; return; } m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); /* Is the entire record queued? */ if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0) so->so_error = EMSGSIZE; return; } sb->sb_flags |= SB_TLS_RX_RUNNING; soref(so); wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_rx_queued, 1); } static struct mbuf * ktls_detach_record(struct sockbuf *sb, int len) { struct mbuf *m, *n, *top; int remain; SOCKBUF_LOCK_ASSERT(sb); MPASS(len <= sb->sb_tlscc); /* * If TLS chain is the exact size of the record, * just grab the whole record. */ top = sb->sb_mtls; if (sb->sb_tlscc == len) { sb->sb_mtls = NULL; sb->sb_mtlstail = NULL; goto out; } /* * While it would be nice to use m_split() here, we need * to know exactly what m_split() allocates to update the * accounting, so do it inline instead. */ remain = len; for (m = top; remain > m->m_len; m = m->m_next) remain -= m->m_len; /* Easy case: don't have to split 'm'. */ if (remain == m->m_len) { sb->sb_mtls = m->m_next; if (sb->sb_mtls == NULL) sb->sb_mtlstail = NULL; m->m_next = NULL; goto out; } /* * Need to allocate an mbuf to hold the remainder of 'm'. Try * with M_NOWAIT first. */ n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { /* * Use M_WAITOK with socket buffer unlocked. If * 'sb_mtls' changes while the lock is dropped, return * NULL to force the caller to retry. */ SOCKBUF_UNLOCK(sb); n = m_get(M_WAITOK, MT_DATA); SOCKBUF_LOCK(sb); if (sb->sb_mtls != top) { m_free(n); return (NULL); } } n->m_flags |= M_NOTREADY; /* Store remainder in 'n'. */ n->m_len = m->m_len - remain; if (m->m_flags & M_EXT) { n->m_data = m->m_data + remain; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); } /* Trim 'm' and update accounting. */ m->m_len -= n->m_len; sb->sb_tlscc -= n->m_len; sb->sb_ccc -= n->m_len; /* Account for 'n'. */ sballoc_ktls_rx(sb, n); /* Insert 'n' into the TLS chain. */ sb->sb_mtls = n; n->m_next = m->m_next; if (sb->sb_mtlstail == m) sb->sb_mtlstail = n; /* Detach the record from the TLS chain. */ m->m_next = NULL; out: MPASS(m_length(top, NULL) == len); for (m = top; m != NULL; m = m->m_next) sbfree_ktls_rx(sb, m); sb->sb_tlsdcc = len; sb->sb_ccc += len; SBCHECK(sb); return (top); } static void ktls_decrypt(struct socket *so) { char tls_header[MBUF_PEXT_HDR_LEN]; struct ktls_session *tls; struct sockbuf *sb; struct tls_record_layer *hdr; struct tls_get_record tgr; struct mbuf *control, *data, *m; uint64_t seqno; int error, remain, tls_len, trail_len; hdr = (struct tls_record_layer *)tls_header; sb = &so->so_rcv; SOCKBUF_LOCK(sb); KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, ("%s: socket %p not running", __func__, so)); tls = sb->sb_tls_info; MPASS(tls != NULL); for (;;) { /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < tls->params.tls_hlen) break; m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); if (hdr->tls_vmajor != tls->params.tls_vmajor || hdr->tls_vminor != tls->params.tls_vminor) error = EINVAL; else if (tls_len < tls->params.tls_hlen || tls_len > tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + tls->params.tls_tlen) error = EMSGSIZE; else error = 0; if (__predict_false(error != 0)) { /* * We have a corrupted record and are likely * out of sync. The connection isn't * recoverable at this point, so abort it. */ SOCKBUF_UNLOCK(sb); counter_u64_add(ktls_offload_corrupted_records, 1); CURVNET_SET(so->so_vnet); so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = error; CURVNET_RESTORE(); goto deref; } /* Is the entire record queued? */ if (sb->sb_tlscc < tls_len) break; /* * Split out the portion of the mbuf chain containing * this TLS record. */ data = ktls_detach_record(sb, tls_len); if (data == NULL) continue; MPASS(sb->sb_tlsdcc == tls_len); seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; SBCHECK(sb); SOCKBUF_UNLOCK(sb); error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* * sbcut/drop/flush discarded these * mbufs. */ m_freem(data); break; } /* * Drop this TLS record's data, but keep * decrypting subsequent records. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; CURVNET_SET(so->so_vnet); so->so_error = EBADMSG; sorwakeup_locked(so); CURVNET_RESTORE(); m_freem(data); SOCKBUF_LOCK(sb); continue; } /* Allocate the control mbuf. */ tgr.tls_type = hdr->tls_type; tgr.tls_vmajor = hdr->tls_vmajor; tgr.tls_vminor = hdr->tls_vminor; tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - trail_len); control = sbcreatecontrol_how(&tgr, sizeof(tgr), TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* sbcut/drop/flush discarded these mbufs. */ MPASS(sb->sb_tlscc == 0); m_freem(data); m_freem(control); break; } /* * Clear the 'dcc' accounting in preparation for * adding the decrypted record. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; SBCHECK(sb); /* If there is no payload, drop all of the data. */ if (tgr.tls_length == htobe16(0)) { m_freem(data); data = NULL; } else { /* Trim header. */ remain = tls->params.tls_hlen; while (remain > 0) { if (data->m_len > remain) { data->m_data += remain; data->m_len -= remain; break; } remain -= data->m_len; data = m_free(data); } /* Trim trailer and clear M_NOTREADY. */ remain = be16toh(tgr.tls_length); m = data; for (m = data; remain > m->m_len; m = m->m_next) { m->m_flags &= ~M_NOTREADY; remain -= m->m_len; } m->m_len = remain; m_freem(m->m_next); m->m_next = NULL; m->m_flags &= ~M_NOTREADY; /* Set EOR on the final mbuf. */ m->m_flags |= M_EOR; } sbappendcontrol_locked(sb, data, control, 0); } sb->sb_flags &= ~SB_TLS_RX_RUNNING; if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) so->so_error = EMSGSIZE; sorwakeup_locked(so); deref: SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_SET(so->so_vnet); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void ktls_enqueue_to_free(struct mbuf *m) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct ktls_wq *wq; bool running; KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf")); m->m_epg_enc_cnt = page_count; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). */ m->m_epg_so = so; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_tx_queued, 1); } static __noinline void ktls_encrypt(struct mbuf *top) { struct ktls_session *tls; struct socket *so; struct mbuf *m; vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; vm_page_t pg; int error, i, len, npages, off, total_pages; bool is_anon; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY), ("%p not unready & nomap mbuf (top = %p)\n", m, top)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); /* * Generate source and destination ivoecs to pass to * the SW encryption backend. For writable mbufs, the * destination iovec is a copy of the source and * encryption is done in place. For file-backed mbufs * (from sendfile), anonymous wired pages are * allocated and assigned to the destination iovec. */ is_anon = (m->m_epg_flags & EPG_FLAG_ANON) != 0; off = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs; i++, off = 0) { len = m_epg_pagelen(m, i, off); src_iov[i].iov_len = len; src_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) + off; if (is_anon) { dst_iov[i].iov_base = src_iov[i].iov_base; dst_iov[i].iov_len = src_iov[i].iov_len; continue; } retry_page: pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { vm_wait(NULL); goto retry_page; } parray[i] = VM_PAGE_TO_PHYS(pg); dst_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(parray[i]) + off; dst_iov[i].iov_len = len; } if (__predict_false(m->m_epg_npgs == 0)) { /* TLS 1.0 empty fragment. */ npages++; } else npages += i; error = (*tls->sw_encrypt)(tls, (const struct tls_record_layer *)m->m_epg_hdr, m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno, m->m_epg_record_type); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } /* * For file-backed mbufs, release the file-backed * pages and replace them in the ext_pgs array with * the anonymous wired pages allocated above. */ if (!is_anon) { /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = parray[i]; /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; } /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ m->m_epg_tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; struct socket *so, *son; STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->m_head) && STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_m_head); STAILQ_CONCAT(&local_m_head, &wq->m_head); STAILQ_INIT(&local_so_head); STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); uma_zfree(zone_mbuf, m); } else { ktls_encrypt(m); counter_u64_add(ktls_cnt_tx_queued, -1); } } STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { ktls_decrypt(so); counter_u64_add(ktls_cnt_rx_queued, -1); } } } diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index dee4a5be6cca..87019dd4c321 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -1,2622 +1,2622 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif #define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RLOCK_ASSERT() NET_EPOCH_ASSERT() #define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch(net_epoch_preempt)) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. */ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; struct lagg_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *); static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #if defined(KERN_TLS) || defined(RATELIMIT) static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int lagg_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int lagg_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void lagg_snd_tag_free(struct m_snd_tag *); static void lagg_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static int lagg_setcaps(struct lagg_port *, int cap); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit_ethernet(struct ifnet *, struct mbuf *); static int lagg_transmit_infiniband(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Link Aggregation"); /* Allow input on any failover links */ VNET_DEFINE_STATIC(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ VNET_DEFINE_STATIC(int, def_use_flowid) = 0; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for using numa */ VNET_DEFINE_STATIC(int, def_use_numa) = 1; #define V_def_use_numa VNET(def_use_numa) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, &VNET_NAME(def_use_numa), 0, "Use numa to steer flows"); /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_ethernet_p = lagg_input_ethernet; lagg_input_infiniband_p = lagg_input_infiniband; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_ethernet_p = NULL; lagg_input_infiniband_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { LAGG_XLOCK_ASSERT(sc); KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_XLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct iflaggparam iflp; struct lagg_softc *sc; struct ifnet *ifp; int if_type; int error; static const uint8_t eaddr[LAGG_ADDR_LEN]; static const uint8_t ib_bcast_addr[INFINIBAND_ADDR_LEN] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; if (params != NULL) { error = copyin(params, &iflp, sizeof(iflp)); if (error) return (error); switch (iflp.lagg_type) { case LAGG_TYPE_ETHERNET: if_type = IFT_ETHER; break; case LAGG_TYPE_INFINIBAND: if_type = IFT_INFINIBAND; break; default: return (EINVAL); } } else { if_type = IFT_ETHER; } sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(if_type); if (ifp == NULL) { free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF); callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0); LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; if (V_def_use_numa) sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); CK_SLIST_INIT(&sc->sc_ports); switch (if_type) { case IFT_ETHER: /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_transmit = lagg_transmit_ethernet; break; case IFT_INFINIBAND: if_initname(ifp, laggname, unit); ifp->if_transmit = lagg_transmit_infiniband; break; default: break; } ifp->if_softc = sc; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; ifp->if_snd_tag_modify = lagg_snd_tag_modify; ifp->if_snd_tag_query = lagg_snd_tag_query; ifp->if_snd_tag_free = lagg_snd_tag_free; ifp->if_ratelimit_query = lagg_ratelimit_query; #endif ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG. */ switch (if_type) { case IFT_ETHER: ether_ifattach(ifp, eaddr); break; case IFT_INFINIBAND: infiniband_ifattach(ifp, eaddr, ib_bcast_addr); break; default: break; } sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_XUNLOCK(sc); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_XLOCK(sc); sc->sc_destroying = 1; lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); switch (ifp->if_type) { case IFT_ETHER: ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); break; case IFT_INFINIBAND: infiniband_ifdetach(ifp); break; default: break; } if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); mtx_destroy(&sc->sc_mtx); LAGG_SX_DESTROY(sc); free(sc, M_LAGG); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap, ena, pena; uint64_t hwa; struct ifnet_hw_tsomax hw_tsomax; LAGG_XLOCK_ASSERT(sc); /* Get common enabled capabilities for the lagg ports */ ena = ~0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ena &= lp->lp_ifp->if_capenable; ena = (ena == ~0 ? 0 : ena); /* * Apply common enabled capabilities back to the lagg ports. * May require several iterations if they are dependent. */ do { pena = ena; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setcaps(lp, ena); ena &= lp->lp_ifp->if_capenable; } } while (pena != ena); /* Get other capabilities from the lagg ports */ cap = ~0; hwa = ~(uint64_t)0; memset(&hw_tsomax, 0, sizeof(hw_tsomax)); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); hwa = (hwa == ~(uint64_t)0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; struct ifreq ifr; int error, i, oldmtu; int if_type; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); if (sc->sc_ifp == ifp) { if_printf(sc->sc_ifp, "cannot add a lagg to itself as a port\n"); return (EINVAL); } if (sc->sc_destroying == 1) return (ENXIO); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } switch (sc->sc_ifp->if_type) { case IFT_ETHER: /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); if_type = IFT_IEEE8023ADLAG; break; case IFT_INFINIBAND: /* XXX Disallow non-infiniband interfaces */ if (ifp->if_type != IFT_INFINIBAND) return (EPROTONOSUPPORT); if_type = IFT_INFINIBANDLAG; break; default: break; } /* Allow the first Ethernet member to define the MTU */ oldmtu = -1; if (CK_SLIST_EMPTY(&sc->sc_ports)) { sc->sc_ifp->if_mtu = ifp->if_mtu; } else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if (ifp->if_ioctl == NULL) { if_printf(sc->sc_ifp, "cannot change MTU for %s\n", ifp->if_xname); return (EINVAL); } oldmtu = ifp->if_mtu; strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (error); } ifr.ifr_mtu = oldmtu; } lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); if_ref(ifp); lp->lp_ifp = ifp; bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen); lp->lp_ifcapenable = ifp->if_capenable; if (CK_SLIST_EMPTY(&sc->sc_ports)) { bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); } lagg_setflags(lp, 1); if (CK_SLIST_EMPTY(&sc->sc_ports)) sc->sc_primary = lp; /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = if_type; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || ((struct lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index > ifp->if_index)) break; } if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; lagg_setmulti(lp); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. */ lagg_port_destroy(lp, 0); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (error); } /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_SXLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static void lagg_port_destroy_cb(epoch_context_t ec) { struct lagg_port *lp; struct ifnet *ifp; lp = __containerof(ec, struct lagg_port, lp_epoch_ctx); ifp = lp->lp_ifp; if_rele(ifp); free(lp, M_LAGG); } static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_XLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); if (lp->lp_detaching == 0) lagg_clrmulti(lp); /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[LAGG_ADDR_LEN]; if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL) bzero(&lladdr, LAGG_ADDR_LEN); else bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } /* * Update lladdr for each port (new primary needs update * as well, to switch from old lladdr to its 'real' one) */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) if_setlladdr(lp_ptr->lp_ifp, lladdr, lp_ptr->lp_ifp->if_addrlen); } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable); if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen); } /* * free port and release it's ifnet reference after a grace period has * elapsed. */ NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; /* Should be checked by the caller */ switch (ifp->if_type) { case IFT_IEEE8023ADLAG: case IFT_INFINIBANDLAG: if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; break; default: goto fallback; } switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_XLOCK(sc); lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp != NULL && lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the different between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; vsum = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } LAGG_RUNLOCK(); /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; return (vsum); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_XLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_watchdog_infiniband(void *arg) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *ifp; struct ifnet *lp_ifp; sc = arg; /* * Because infiniband nodes have a fixed MAC address, which is * generated by the so-called GID, we need to regularly update * the link level address of the parent lagg device when * the active port changes. Possibly we could piggy-back on * link up/down events aswell, but using a timer also provides * a guarantee against too frequent events. This operation * does not have to be atomic. */ LAGG_RLOCK(); lp = lagg_link_active(sc, sc->sc_primary); if (lp != NULL) { ifp = sc->sc_ifp; lp_ifp = lp->lp_ifp; if (ifp != NULL && lp_ifp != NULL && memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0) { memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen); CURVNET_SET(ifp->if_vnet); EVENTHANDLER_INVOKE(iflladdr_event, ifp); CURVNET_RESTORE(); } } LAGG_RUNLOCK(); callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg); } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; LAGG_XLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { LAGG_XUNLOCK(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be if_setlladdr() notification * that lladdr has been changed. */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), ifp->if_addrlen) != 0) if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ifp->if_addrlen); } lagg_proto_init(sc); if (ifp->if_type == IFT_INFINIBAND) { mtx_lock(&sc->sc_mtx); lagg_watchdog_infiniband(sc); mtx_unlock(&sc->sc_mtx); } LAGG_XUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_XLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); mtx_lock(&sc->sc_mtx); callout_stop(&sc->sc_watchdog); mtx_unlock(&sc->sc_mtx); callout_drain(&sc->sc_watchdog); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0, oldmtu; bzero(&rpbuf, sizeof(rpbuf)); /* XXX: This can race with lagg_clone_destroy. */ switch (cmd) { case SIOCGLAGG: LAGG_XLOCK(sc); buflen = sc->sc_count * sizeof(struct lagg_reqport); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_XUNLOCK(sc); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } /* Infiniband only supports the failover protocol. */ if (ra->ra_proto != LAGG_PROTO_FAILOVER && ifp->if_type == IFT_INFINIBAND) { error = EPROTONOSUPPORT; break; } LAGG_XLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(); lagg_proto_attach(sc, ra->ra_proto); LAGG_XUNLOCK(sc); break; case SIOCGLAGGOPTS: LAGG_XLOCK(sc); ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_stride; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: error = priv_check(td, PRIV_NET_LAGG); if (error) break; /* * The stride option was added without defining a corresponding * LAGG_OPT flag, so handle a non-zero value before checking * anything else to preserve compatibility. */ LAGG_XLOCK(sc); if (ro->ro_opts == 0 && ro->ro_bkt != 0) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) { LAGG_XUNLOCK(sc); error = EINVAL; break; } sc->sc_stride = ro->ro_bkt; } if (ro->ro_opts == 0) { LAGG_XUNLOCK(sc); break; } /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_USE_NUMA: case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: case LAGG_OPT_RR_LIMIT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_FAST_TIMO: case -LAGG_OPT_LACP_FAST_TIMO: valid = lacp = 1; break; default: valid = lacp = 0; break; } if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT, RR and LACP options. */ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN || ro->ro_bkt == 0) { error = EINVAL; LAGG_XUNLOCK(sc); break; } sc->sc_stride = ro->ro_bkt; } else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_FAST_TIMO: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_FAST_TIMO: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_XUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_XLOCK(sc); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_XUNLOCK(sc); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_XLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_XUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); if_rele(tpif); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); if_rele(tpif); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. */ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif oldmtu = ifp->if_mtu; LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); /* * LAGG MTU may change during addition of the first port. * If it did, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_XLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_XUNLOCK(sc); if_rele(tpif); break; } error = lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ lagg_stop(sc); LAGG_XUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ LAGG_XUNLOCK(sc); (*ifp->if_init)(sc); } else LAGG_XUNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_clrmulti(lp); lagg_setmulti(lp); } LAGG_XUNLOCK(sc); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: if (ifp->if_type == IFT_INFINIBAND) error = EINVAL; else error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); error = 0; break; case SIOCSIFMTU: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); else error = EINVAL; if (error != 0) { if_printf(ifp, "failed to change MTU to %d on port %s, " "reverting all ports to original MTU (%d)\n", ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu); break; } } if (error == 0) { ifp->if_mtu = ifr->ifr_mtu; } else { /* set every port back to the original MTU */ ifr->ifr_mtu = ifp->if_mtu; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } } LAGG_XUNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static inline struct lagg_snd_tag * mst_to_lst(struct m_snd_tag *mst) { return (__containerof(mst, struct lagg_snd_tag, com)); } /* * Look up the port used by a specific flow. This only works for lagg * protocols with deterministic port mappings (e.g. not roundrobin). * In addition protocols which use a hash to map flows to ports must * be configured to use the mbuf flowid rather than hashing packet * contents. */ static struct lagg_port * lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype, uint8_t numa_domain) { struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t hash, p; sc = ifp->if_softc; switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: return (lagg_link_active(sc, sc->sc_primary)); case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || flowtype == M_HASHTYPE_NONE) return (NULL); p = flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; return (lagg_link_active(sc, lp)); case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || flowtype == M_HASHTYPE_NONE) return (NULL); hash = flowid >> sc->flowid_shift; return (lacp_select_tx_port_by_hash(sc, hash, numa_domain)); default: return (NULL); } } static int lagg_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct lagg_snd_tag *lst; struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lp_ifp; int error; sc = ifp->if_softc; LAGG_RLOCK(); lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype, params->hdr.numa_domain); if (lp == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } - if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) { + if (lp->lp_ifp == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } lp_ifp = lp->lp_ifp; if_ref(lp_ifp); LAGG_RUNLOCK(); lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT); if (lst == NULL) { if_rele(lp_ifp); return (ENOMEM); } - error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag); + error = m_snd_tag_alloc(lp_ifp, params, &lst->tag); if_rele(lp_ifp); if (error) { free(lst, M_LAGG); return (error); } m_snd_tag_init(&lst->com, ifp, lst->tag->type); *ppmt = &lst->com; return (0); } static int lagg_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params)); } static int lagg_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); return (lst->tag->ifp->if_snd_tag_query(lst->tag, params)); } static void lagg_snd_tag_free(struct m_snd_tag *mst) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); m_snd_tag_rele(lst->tag); free(lst, M_LAGG); } static void lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) { /* * For lagg, we have an indirect * interface. The caller needs to * get a ratelimit tag on the actual * interface the flow will go on. */ q->rate_table = NULL; q->flags = RT_IS_INDIRECT; q->max_flows = 0; q->number_of_rates = 0; } #endif static int lagg_setmulti(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; IF_ADDR_WLOCK(scifp); CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } return (0); } static int lagg_clrmulti(struct lagg_port *lp) { struct lagg_mc *mc; LAGG_XLOCK_ASSERT(lp->lp_softc); while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); free(mc, M_LAGG); } return (0); } static int lagg_setcaps(struct lagg_port *lp, int cap) { struct ifreq ifr; if (lp->lp_ifp->if_capenable == cap) return (0); if (lp->lp_ioctl == NULL) return (ENXIO); ifr.ifr_reqcap = cap; return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr)); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_XLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); return (error); } static int lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } INFINIBAND_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; LAGG_RLOCK(); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || lp->lp_detaching != 0 || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); m = lagg_proto_input(sc, lp, m); if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) { m_freem(m); m = NULL; } LAGG_RUNLOCK(); return (m); } static struct mbuf * lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; LAGG_RLOCK(); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || lp->lp_detaching != 0 || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(); m_freem(m); return (NULL); } INFINIBAND_BPF_MTAP(scifp, m); m = lagg_proto_input(sc, lp, m); if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) { m_freem(m); m = NULL; } LAGG_RUNLOCK(); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; LAGG_XLOCK_ASSERT(sc); /* LACP handles link state itself */ if (sc->sc_proto == LAGG_PROTO_LACP) return; /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } LAGG_RUNLOCK(); if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; LAGG_RUNLOCK(); sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_XLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_XUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; /* * Search a port which reports an active link state. */ #ifdef INVARIANTS /* * This is called with either LAGG_RLOCK() held or * LAGG_XLOCK(sc) held. */ if (!in_epoch(net_epoch_preempt)) LAGG_XLOCK_ASSERT(sc); #endif if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { return (lp_next); } } found: return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct lagg_snd_tag *lst; struct m_snd_tag *mst; mst = m->m_pkthdr.snd_tag; lst = mst_to_lst(mst); if (lst->tag->ifp != ifp) { m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag); m_snd_tag_rele(mst); } #endif return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; sc->sc_stride = 1; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; p = atomic_fetchadd_32(&sc->sc_seq, 1); p /= sc->sc_stride; p %= sc->sc_count; lp = CK_SLIST_FIRST(&sc->sc_ports); while (p--) lp = CK_SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; LAGG_RLOCK_ASSERT(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } lagg_enqueue(last->lp_ifp, m0); } last = lp; } if (last == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { errors++; if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors); m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (errors != 0) if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors); return (ret); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_LAGG); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0, rv; rv = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) { rv = EINVAL; break; } if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } return (rv); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_SXLOCK_ASSERT(sc); /* purge all the lacp ports */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 920c65323ae7..9358724844cb 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1,2105 +1,2105 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; #if defined(KERN_TLS) || defined(RATELIMIT) struct vlan_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; static inline struct vlan_snd_tag * mst_to_vst(struct m_snd_tag *mst) { return (__containerof(mst, struct vlan_snd_tag, com)); } #endif /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. */ #ifdef VLAN_ARRAY #define VLAN_FOREACH(_ifv, _trunk) \ size_t _i; \ for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i]) != NULL) #else /* VLAN_ARRAY */ #define VLAN_FOREACH(_ifv, _trunk) \ struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* * This macro provides a facility to iterate over every vlan on a trunk while * also modifying the number of vlans on the trunk. The iteration continues * until some condition is met or there are no more vlans on the trunk. */ #ifdef VLAN_ARRAY /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i])) #else /* VLAN_ARRAY */ /* * The hash table case is more complicated. We allow for the hash table to be * modified (i.e. vlans removed) while we are iterating over it. To allow for * this we must restart the iteration every time we "touch" something during * the iteration, since removal will resize the hash table and invalidate our * current position. If acting on the touched element causes the trunk to be * emptied, then iteration also stops. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ bool _touch = false; \ for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; struct epoch_context mc_epoch_ctx; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) (TRUNK(ifv)->parent) void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ struct ether_8021q_tag ifv_qtag; #define ifv_proto ifv_qtag.proto #define ifv_vid ifv_qtag.vid #define ifv_pcp ifv_qtag.pcp struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; extern int vlan_mtag_pcp; static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* * if_vlan uses two module-level synchronizations primitives to allow concurrent * modification of vlan interfaces and (mostly) allow for vlans to be destroyed * while they are being used for tx/rx. To accomplish this in a way that has * acceptable performance and cooperation with other parts of the network stack * there is a non-sleepable epoch(9) and an sx(9). * * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE) #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. */ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int vlan_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int vlan_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag, uint16_t proto); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif static void vlan_mc_free(struct epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. */ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_LOCK_DESTROY(trunk); if_rele(trunk->parent); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) return (error); } CURVNET_RESTORE(); return (0); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; /* Need the epoch since this is run on taskqueue_swi. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR * ioctl calls on the parent garbling the lladdr of the child vlan. */ TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { /* * Copy new new lladdr into the ifv_ifp, enqueue a task * to actually call if_setlladdr. if_setlladdr needs to * be deferred to a taskqueue because it will call into * the if_vlan ioctl path and try to acquire the global * lock. */ ifv_ifp = ifv->ifv_ifp; bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp), ifp->if_addrlen); sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr; sdl->sdl_alen = ifp->if_addrlen; taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; struct ifvlantrunk *trunk; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_XLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_XUNLOCK(); return; } /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk, ifp->if_vlantrunk == NULL) vlan_unconfig_locked(ifv->ifv_ifp, 1); /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_XUNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; return (ifp); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { if_clone_detach(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_vlan_uninit, NULL); #endif /* * Check for .[. ...] style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strrchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. */ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { struct ifnet *ifp; const char *cp; ifp = vlan_clone_match_ethervid(name, NULL); if (ifp != NULL) { if_rele(ifp); return (1); } if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int error; int vid; uint16_t proto; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ proto = ETHERTYPE_VLAN; /* * There are two ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter is supported * for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); if (error) return error; p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); error = ifc_name2unit(name, &unit); if (error != 0) { if_rele(p); return (error); } vid = vlr.vlr_tag; proto = vlr.vlr_proto; wildcard = (unit < 0); } else { p = NULL; error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); } error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; ifp->if_snd_tag_modify = vlan_snd_tag_modify; ifp->if_snd_tag_query = vlan_snd_tag_query; ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid, proto); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } return (0); } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifvlan *ifv = ifp->if_softc; if (ifp->if_vlantrunk) return (EBUSY); ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, ifp->if_dunit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; NET_EPOCH_ASSERT(); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct vlan_snd_tag *vst; struct m_snd_tag *mst; MPASS(m->m_pkthdr.snd_tag->ifp == ifp); mst = m->m_pkthdr.snd_tag; vst = mst_to_vst(mst); if (vst->tag->ifp != p) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); m_snd_tag_rele(mst); } #endif /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ifvlan *ifv; struct ifnet *p; NET_EPOCH_ASSERT(); /* * Find the first non-VLAN parent interface. */ ifv = ifp->if_softc; do { if (TRUNK(ifv) == NULL) { m_freem(m); return (ENETDOWN); } p = PARENT(ifv); ifv = p->if_softc; } while (p->if_type == IFT_L2VLAN); return p->if_output(ifp, m, dst, ro); } /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } } vid = EVL_VLANOFTAG(tag); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } if (vlan_mtag_pcp) { /* * While uncommon, it is possible that we will find a 802.1q * packet encapsulated inside another packet that also had an * 802.1q header. For example, ethernet tunneled over IPSEC * arriving over ethernet. In that case, we replace the * existing 802.1q PCP m_tag value. */ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); } m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); } static void vlan_lladdr_fn(void *arg, int pending __unused) { struct ifvlan *ifv; struct ifnet *ifp; ifv = (struct ifvlan *)arg; ifp = ifv->ifv_ifp; CURVNET_SET(ifp->if_vnet); /* The ifv_ifp already has the lladdr copied in. */ if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen); CURVNET_RESTORE(); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid, uint16_t proto) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (p->if_type != IFT_ETHER && p->if_type != IFT_L2VLAN && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. * VID numbers 0x0 and 0xFFF are reserved. */ if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) return (EINVAL); if (ifv->ifv_trunk) return (EBUSY); VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); TRUNK_WUNLOCK(trunk); } else { trunk = p->if_vlantrunk; } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ ifv->ifv_pcp = 0; /* Default: best effort delivery. */ error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = proto; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. */ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; ifp->if_pcp = ifv->ifv_pcp; /* * We wrap the parent's if_output using vlan_output to ensure that it * can't become stale. */ ifp->if_output = vlan_output; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying * physical interface's. */ TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; /* * Do not schedule link address update if it was the same * as previous parent's. This helps avoid updating for each * associated llentry. */ if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) { bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_XLOCK(); vlan_unconfig_locked(ifp, 0); VLAN_XUNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_XLOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. */ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; NET_EPOCH_WAIT(); trunk_destroy(trunk); } } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; VLAN_SXLOCK_ASSERT(); ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; int cap = 0, ena = 0, mena; u_long hwa = 0; NET_EPOCH_ASSERT(); VLAN_SXLOCK_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (ena & IFCAP_TXCSUM) hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); if (ena & IFCAP_TXCSUM_IPV6) hwa |= p->if_hwassist & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ memset(&hw_tsomax, 0, sizeof(hw_tsomax)); if_hw_tsomax_common(p, &hw_tsomax); if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) cap |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ena |= mena & IFCAP_TSO; if (ena & IFCAP_TSO) hwa |= p->if_hwassist & CSUM_TSO; } /* * If the parent interface can do LRO and checksum offloading on * VLANs, then guess it may do LRO on VLANs. False positive here * cost nothing, while false negative may lead to some confusions. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & IFCAP_LRO; if (p->if_capenable & IFCAP_VLAN_HWCSUM) ena |= p->if_capenable & IFCAP_LRO; /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) cap |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { TOEDEV(ifp) = TOEDEV(p); ena |= mena & IFCAP_TOE; } /* * If the parent interface supports dynamic link state, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_LINKSTATE); ena |= (mena & IFCAP_LINKSTATE); #ifdef RATELIMIT /* * If the parent interface supports ratelimiting, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_TXRTLMT); ena |= (mena & IFCAP_TXRTLMT); #endif /* * If the parent interface supports unmapped mbufs, so does * the VLAN interface. Note that this should be fine even for * interfaces that don't support hardware tagging as headers * are prepended in normal mbufs to unmapped mbufs holding * payload data. */ cap |= (p->if_capabilities & IFCAP_NOMAP); ena |= (mena & IFCAP_NOMAP); /* * If the parent interface can offload encryption and segmentation * of TLS records over TCP, propagate it's capability to the VLAN * interface. * * All TLS drivers in the tree today can deal with VLANs. If * this ever changes, then a new IFCAP_VLAN_TXTLS can be * defined. */ if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT)) cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT); if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT)) ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT); ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; } static void vlan_trunk_capabilities(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; VLAN_SLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_SUNLOCK(); return; } NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) vlan_capabilities(ifv); NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0, oldmtu; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ifp->if_addrlen); break; case SIOCGIFMEDIA: VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); if_ref(p); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); if_rele(p); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { error = EINVAL; } VLAN_SUNLOCK(); break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_SLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_WLOCK(trunk); if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; TRUNK_WUNLOCK(trunk); } else error = EINVAL; VLAN_SUNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit_ref(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } oldmtu = ifp->if_mtu; error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto); if_rele(p); /* * VLAN MTU may change during addition of the vlandev. * If it did, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; vlr.vlr_proto = ifv->ifv_proto; } VLAN_SUNLOCK(); error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ VLAN_XLOCK(); if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); VLAN_XUNLOCK(); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. * * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ VLAN_XLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) error = vlan_setmulti(ifp); VLAN_XUNLOCK(); break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif ifr->ifr_vlan_pcp = ifv->ifv_pcp; break; case SIOCSVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = priv_check(curthread, PRIV_NET_SETVLANPCP); if (error) break; if (ifr->ifr_vlan_pcp > 7) { error = EINVAL; break; } ifv->ifv_pcp = ifr->ifr_vlan_pcp; ifp->if_pcp = ifv->ifv_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); break; case SIOCSIFCAP: VLAN_SLOCK(); ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { struct epoch_tracker et; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; default: error = EINVAL; break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct epoch_tracker et; struct vlan_snd_tag *vst; struct ifvlan *ifv; struct ifnet *parent; int error; NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (ifv->ifv_trunk != NULL) parent = PARENT(ifv); else parent = NULL; - if (parent == NULL || parent->if_snd_tag_alloc == NULL) { + if (parent == NULL) { NET_EPOCH_EXIT(et); return (EOPNOTSUPP); } if_ref(parent); NET_EPOCH_EXIT(et); vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); if (vst == NULL) { if_rele(parent); return (ENOMEM); } - error = parent->if_snd_tag_alloc(parent, params, &vst->tag); + error = m_snd_tag_alloc(parent, params, &vst->tag); if_rele(parent); if (error) { free(vst, M_VLAN); return (error); } m_snd_tag_init(&vst->com, ifp, vst->tag->type); *ppmt = &vst->com; return (0); } static int vlan_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); } static int vlan_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); } static void vlan_snd_tag_free(struct m_snd_tag *mst) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); m_snd_tag_rele(vst->tag); free(vst, M_VLAN); } #endif diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index a8255a04ae6a..03cd09fb448d 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,3526 +1,3521 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #ifdef INET #include #include #endif #include #include #ifdef TCPHPTS #include #endif #include #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #endif #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #ifdef RATELIMIT counter_u64_t rate_limit_active; counter_u64_t rate_limit_alloc_fail; counter_u64_t rate_limit_set_ok; static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "IP Rate Limiting"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, &rate_limit_active, "Active rate limited connections"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, &rate_limit_alloc_fail, "Rate limited connection failures"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, &rate_limit_set_ok, "Rate limited setting succeeded"); #endif /* RATELIMIT */ #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (!grp) return (NULL); grp->il_vflag = vflag; grp->il_lport = port; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Don't allow jailed socket to join local group. */ if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) return (0); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. */ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) break; } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. */ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } return; } } } /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); #ifdef NUMA inp->inp_numa_domain = M_NODOM; #endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif #if defined(INET) || defined(INET6) /* * Assign a local port like in_pcb_lport(), but also used with connect() * and a foreign address and port. If fsa is non-NULL, choose a local port * that is unused with those, otherwise one that is completely unused. * lsa can be NULL for IPv6. */ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr, faddr; #endif #ifdef INET6 struct in6_addr *laddr6, *faddr6; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET laddr.s_addr = INADDR_ANY; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { if (lsa != NULL) laddr = ((struct sockaddr_in *)lsa)->sin_addr; if (fsa != NULL) faddr = ((struct sockaddr_in *)fsa)->sin_addr; } #endif #ifdef INET6 laddr6 = NULL; if ((inp->inp_vflag & INP_IPV6) != 0) { if (lsa != NULL) laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; if (fsa != NULL) faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; } #endif tmpinp = NULL; lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); if (fsa != NULL) { #ifdef INET if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, NULL); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, NULL); } #endif } else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } } while (tmpinp != NULL); *lportp = lport; return (0); } /* * Select a local port (number) to use. */ int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct sockaddr_in laddr; if (laddrp) { bzero(&laddr, sizeof(laddr)); laddr.sin_family = AF_INET; laddr.sin_addr = *laddrp; } return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : NULL, lportp, NULL, 0, cred, lookupflags)); } /* * Return cached socket options. */ int inp_so_options(const struct inpcb *inp) { int so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) so_options |= SO_REUSEPORT_LB; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) || (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || ((reuseport & tw->tw_so_options) == 0 && (reuseport_lb & tw->tw_so_options) == 0)) { return (EADDRINUSE); } } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m, bool rehash) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { KASSERT(rehash == true, ("Rehashing required for unbound inps")); inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; if (rehash) { in_pcbrehash_mbuf(inp, m); } else { in_pcbinshash_mbuf(inp, m); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin, dst; struct nhop_object *nh; int error; NET_EPOCH_ASSERT(); KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; nh = NULL; bzero(&dst, sizeof(dst)); sin = &dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 0, NHR_NONE, 0); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (nh == NULL || nh->nh_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = nh->nh_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { struct in_ifaddr *ia; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ NET_EPOCH_ASSERT(); INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; #ifdef ROUTE_MPATH if (CALC_FLOWID_OUTBOUND) { uint32_t hash_val, hash_type; hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, inp->inp_socket->so_proto->pr_protocol, &hash_type); inp->inp_flowid = hash_val; inp->inp_flowtype = hash_type; } #endif if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } if (lport != 0) { oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } } else { struct sockaddr_in lsin, fsin; bzero(&lsin, sizeof(lsin)); bzero(&fsin, sizeof(fsin)); lsin.sin_family = AF_INET; lsin.sin_addr = laddr; fsin.sin_family = AF_INET; fsin.sin_addr = faddr; error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, &lport, (struct sockaddr *)& fsin, fport, cred, INPLOOKUP_WILDCARD); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } void in_pcblist_rele_rlocked(epoch_context_t ctx) { struct in_pcblist *il; struct inpcb *inp; struct inpcbinfo *pcbinfo; int i, n; il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); pcbinfo = il->il_pcbinfo; n = il->il_count; INP_INFO_WLOCK(pcbinfo); for (i = 0; i < n; i++) { inp = il->il_inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); free(il, M_TEMP); } static void inpcbport_free(epoch_context_t ctx) { struct inpcbport *phd; phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); free(phd, M_PCB); } static void in_pcbfree_deferred(epoch_context_t ctx) { struct inpcb *inp; int released __unused; inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); INP_WLOCK(inp); CURVNET_SET(inp->inp_vnet); #ifdef INET struct ip_moptions *imo = inp->inp_moptions; inp->inp_moptions = NULL; #endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET6 struct ip6_moptions *im6o = NULL; if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); im6o = inp->in6p_moptions; inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif released = in_pcbrele_wlocked(inp); MPASS(released); #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif CURVNET_RESTORE(); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK(pcbinfo); in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); RO_INVALIDATE_CACHE(&inp->inp_route); /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; INP_WUNLOCK(inp); NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); #ifdef INVARIANTS if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) MPASS(inp->inp_refcount > 1); #endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ restart: IP_MFILTER_FOREACH(imf, &imo->imo_head) { if ((inm = imf->imf_inm) == NULL) continue; if (inm->inm_ifp != ifp) continue; ip_mfilter_remove(&imo->imo_head, imf); IN_MULTI_LOCK_ASSERT(); in_leavegroup_locked(inm, NULL); ip_mfilter_free(imf); goto restart; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, uint16_t fport, int lookupflags) { struct inpcb *local_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Order of socket selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). * * NOTE: * - Load balanced group does not contain jailed sockets * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport != lport) continue; idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % grp->il_inpcnt; if (grp->il_laddr.s_addr == laddr->s_addr) return (grp->il_inp[idx]); if (grp->il_laddr.s_addr == INADDR_ANY && (lookupflags & INPLOOKUP_WILDCARD) != 0) local_wild = grp->il_inp[idx]; } return (local_wild); } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (NULL); } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look in lb group (for wildcard match). */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags); if (inp != NULL) return (inp); } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_WUNLOCK(inp); inp = NULL; } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); inp = NULL; } } else panic("%s: locking bug", __func__); #ifdef INVARIANTS if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); } #endif } return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { int ret = in_pcbinslbgrouphash(inp); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). */ return (ret); } } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (m != NULL) { in_pcbgroup_update_mbuf(inp, m); } else { in_pcbgroup_update(inp); } #endif return (0); } int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, NULL)); } int in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m) { return (in_pcbinshash_internal(inp, m)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); /* XXX: Only do if SO_REUSEPORT_LB set? */ in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); if (*st != NULL) return (EINVAL); - if (ifp->if_snd_tag_alloc == NULL) { - error = EOPNOTSUPP; - } else { - error = ifp->if_snd_tag_alloc(ifp, ¶ms, st); - + error = m_snd_tag_alloc(ifp, ¶ms, st); #ifdef INET - if (error == 0) { - counter_u64_add(rate_limit_set_ok, 1); - counter_u64_add(rate_limit_active, 1); - } else - counter_u64_add(rate_limit_alloc_fail, 1); + if (error == 0) { + counter_u64_add(rate_limit_set_ok, 1); + counter_u64_add(rate_limit_active, 1); + } else if (error != EOPNOTSUPP) + counter_u64_add(rate_limit_alloc_fail, 1); #endif - } return (error); } void in_pcbdetach_tag(struct m_snd_tag *mst) { m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; m_snd_tag_rele(mst); } int in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { int error; /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new * tag if we fail to allocate one this time. */ if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { in_pcbdetach_txrtlmt(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; return (error); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #ifdef INET static void rl_init(void *st) { rate_limit_active = counter_u64_alloc(M_WAITOK); rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); rate_limit_set_ok = counter_u64_alloc(M_WAITOK); } SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif #endif /* RATELIMIT */ diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c index e7c4aaa5bd1c..97f066be69e0 100644 --- a/sys/netinet/tcp_ratelimit.c +++ b/sys/netinet/tcp_ratelimit.c @@ -1,1557 +1,1547 @@ /*- * * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2018-2020 * Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * Author: Randall Stewart */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #ifdef INET6 #include #endif #include #ifndef USECS_IN_SECOND #define USECS_IN_SECOND 1000000 #endif /* * For the purposes of each send, what is the size * of an ethernet frame. */ MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); #ifdef RATELIMIT /* * The following preferred table will seem weird to * the casual viewer. Why do we not have any rates below * 1Mbps? Why do we have a rate at 1.44Mbps called common? * Why do the rates cluster in the 1-100Mbps range more * than others? Why does the table jump around at the beginnign * and then be more consistently raising? * * Let me try to answer those questions. A lot of * this is dependant on the hardware. We have three basic * supporters of rate limiting * * Chelsio - Supporting 16 configurable rates. * Mlx - c4 supporting 13 fixed rates. * Mlx - c5 & c6 supporting 127 configurable rates. * * The c4 is why we have a common rate that is available * in all rate tables. This is a selected rate from the * c4 table and we assure its available in all ratelimit * tables. This way the tcp_ratelimit code has an assured * rate it should always be able to get. This answers a * couple of the questions above. * * So what about the rest, well the table is built to * try to get the most out of a joint hardware/software * pacing system. The software pacer will always pick * a rate higher than the b/w that it is estimating * * on the path. This is done for two reasons. * a) So we can discover more b/w * and * b) So we can send a block of MSS's down and then * have the software timer go off after the previous * send is completely out of the hardware. * * But when we do we don't want to have the delay * between the last packet sent by the hardware be * excessively long (to reach our desired rate). * * So let me give an example for clarity. * * Lets assume that the tcp stack sees that 29,110,000 bps is * what the bw of the path is. The stack would select the * rate 31Mbps. 31Mbps means that each send that is done * by the hardware will cause a 390 micro-second gap between * the packets sent at that rate. For 29,110,000 bps we * would need 416 micro-seconds gap between each send. * * Note that are calculating a complete time for pacing * which includes the ethernet, IP and TCP overhead. So * a full 1514 bytes is used for the above calculations. * My testing has shown that both cards are also using this * as their basis i.e. full payload size of the ethernet frame. * The TCP stack caller needs to be aware of this and make the * appropriate overhead calculations be included in its choices. * * Now, continuing our example, we pick a MSS size based on the * delta between the two rates (416 - 390) divided into the rate * we really wish to send at rounded up. That results in a MSS * send of 17 mss's at once. The hardware then will * run out of data in a single 17MSS send in 6,630 micro-seconds. * * On the other hand the software pacer will send more data * in 7,072 micro-seconds. This means that we will refill * the hardware 52 microseconds after it would have sent * next if it had not ran out of data. This is a win since we are * only sending every 7ms or so and yet all the packets are spaced on * the wire with 94% of what they should be and only * the last packet is delayed extra to make up for the * difference. * * Note that the above formula has two important caveat. * If we are above (b/w wise) over 100Mbps we double the result * of the MSS calculation. The second caveat is if we are 500Mbps * or more we just send the maximum MSS at once i.e. 45MSS. At * the higher b/w's even the cards have limits to what times (timer granularity) * they can insert between packets and start to send more than one * packet at a time on the wire. * */ #define COMMON_RATE 180500 const uint64_t desired_rates[] = { 122500, /* 1Mbps - rate 1 */ 180500, /* 1.44Mpbs - rate 2 common rate */ 375000, /* 3Mbps - rate 3 */ 625000, /* 5Mbps - rate 4 */ 875000, /* 7Mbps - rate 5 */ 1125000, /* 9Mbps - rate 6 */ 1375000, /* 11Mbps - rate 7 */ 1625000, /* 13Mbps - rate 8 */ 2625000, /* 21Mbps - rate 9 */ 3875000, /* 31Mbps - rate 10 */ 5125000, /* 41Meg - rate 11 */ 12500000, /* 100Mbps - rate 12 */ 25000000, /* 200Mbps - rate 13 */ 50000000, /* 400Mbps - rate 14 */ 63750000, /* 51Mbps - rate 15 */ 100000000, /* 800Mbps - rate 16 */ 1875000, /* 15Mbps - rate 17 */ 2125000, /* 17Mbps - rate 18 */ 2375000, /* 19Mbps - rate 19 */ 2875000, /* 23Mbps - rate 20 */ 3125000, /* 25Mbps - rate 21 */ 3375000, /* 27Mbps - rate 22 */ 3625000, /* 29Mbps - rate 23 */ 4125000, /* 33Mbps - rate 24 */ 4375000, /* 35Mbps - rate 25 */ 4625000, /* 37Mbps - rate 26 */ 4875000, /* 39Mbps - rate 27 */ 5375000, /* 43Mbps - rate 28 */ 5625000, /* 45Mbps - rate 29 */ 5875000, /* 47Mbps - rate 30 */ 6125000, /* 49Mbps - rate 31 */ 6625000, /* 53Mbps - rate 32 */ 6875000, /* 55Mbps - rate 33 */ 7125000, /* 57Mbps - rate 34 */ 7375000, /* 59Mbps - rate 35 */ 7625000, /* 61Mbps - rate 36 */ 7875000, /* 63Mbps - rate 37 */ 8125000, /* 65Mbps - rate 38 */ 8375000, /* 67Mbps - rate 39 */ 8625000, /* 69Mbps - rate 40 */ 8875000, /* 71Mbps - rate 41 */ 9125000, /* 73Mbps - rate 42 */ 9375000, /* 75Mbps - rate 43 */ 9625000, /* 77Mbps - rate 44 */ 9875000, /* 79Mbps - rate 45 */ 10125000, /* 81Mbps - rate 46 */ 10375000, /* 83Mbps - rate 47 */ 10625000, /* 85Mbps - rate 48 */ 10875000, /* 87Mbps - rate 49 */ 11125000, /* 89Mbps - rate 50 */ 11375000, /* 91Mbps - rate 51 */ 11625000, /* 93Mbps - rate 52 */ 11875000, /* 95Mbps - rate 53 */ 13125000, /* 105Mbps - rate 54 */ 13750000, /* 110Mbps - rate 55 */ 14375000, /* 115Mbps - rate 56 */ 15000000, /* 120Mbps - rate 57 */ 15625000, /* 125Mbps - rate 58 */ 16250000, /* 130Mbps - rate 59 */ 16875000, /* 135Mbps - rate 60 */ 17500000, /* 140Mbps - rate 61 */ 18125000, /* 145Mbps - rate 62 */ 18750000, /* 150Mbps - rate 64 */ 20000000, /* 160Mbps - rate 65 */ 21250000, /* 170Mbps - rate 66 */ 22500000, /* 180Mbps - rate 67 */ 23750000, /* 190Mbps - rate 68 */ 26250000, /* 210Mbps - rate 69 */ 27500000, /* 220Mbps - rate 70 */ 28750000, /* 230Mbps - rate 71 */ 30000000, /* 240Mbps - rate 72 */ 31250000, /* 250Mbps - rate 73 */ 34375000, /* 275Mbps - rate 74 */ 37500000, /* 300Mbps - rate 75 */ 40625000, /* 325Mbps - rate 76 */ 43750000, /* 350Mbps - rate 77 */ 46875000, /* 375Mbps - rate 78 */ 53125000, /* 425Mbps - rate 79 */ 56250000, /* 450Mbps - rate 80 */ 59375000, /* 475Mbps - rate 81 */ 62500000, /* 500Mbps - rate 82 */ 68750000, /* 550Mbps - rate 83 */ 75000000, /* 600Mbps - rate 84 */ 81250000, /* 650Mbps - rate 85 */ 87500000, /* 700Mbps - rate 86 */ 93750000, /* 750Mbps - rate 87 */ 106250000, /* 850Mbps - rate 88 */ 112500000, /* 900Mbps - rate 89 */ 125000000, /* 1Gbps - rate 90 */ 156250000, /* 1.25Gps - rate 91 */ 187500000, /* 1.5Gps - rate 92 */ 218750000, /* 1.75Gps - rate 93 */ 250000000, /* 2Gbps - rate 94 */ 281250000, /* 2.25Gps - rate 95 */ 312500000, /* 2.5Gbps - rate 96 */ 343750000, /* 2.75Gbps - rate 97 */ 375000000, /* 3Gbps - rate 98 */ 500000000, /* 4Gbps - rate 99 */ 625000000, /* 5Gbps - rate 100 */ 750000000, /* 6Gbps - rate 101 */ 875000000, /* 7Gbps - rate 102 */ 1000000000, /* 8Gbps - rate 103 */ 1125000000, /* 9Gbps - rate 104 */ 1250000000, /* 10Gbps - rate 105 */ 1875000000, /* 15Gbps - rate 106 */ 2500000000 /* 20Gbps - rate 107 */ }; #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) #define RS_ORDERED_COUNT 16 /* * Number that are in order * at the beginning of the table, * over this a sort is required. */ #define RS_NEXT_ORDER_GROUP 16 /* * The point in our table where * we come fill in a second ordered * group (index wise means -1). */ #define ALL_HARDWARE_RATES 1004 /* * 1Meg - 1Gig in 1 Meg steps * plus 100, 200k and 500k and * 10Gig */ #define RS_ONE_MEGABIT_PERSEC 1000000 #define RS_ONE_GIGABIT_PERSEC 1000000000 #define RS_TEN_GIGABIT_PERSEC 10000000000 static struct head_tcp_rate_set int_rs; static struct mtx rs_mtx; uint32_t rs_number_alive; uint32_t rs_number_dead; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Ratelimit stats"); SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, &rs_number_alive, 0, "Number of interfaces initialized for ratelimiting"); SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, &rs_number_dead, 0, "Number of interfaces departing from ratelimiting"); static void rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) { /* * Add sysctl entries for thus interface. */ if (rs->rs_flags & RS_INTF_NO_SUP) { SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "disable", CTLFLAG_RD, &rs->rs_disable, 0, "Disable this interface from new hdwr limiting?"); } else { SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "disable", CTLFLAG_RW, &rs->rs_disable, 0, "Disable this interface from new hdwr limiting?"); } SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "minseg", CTLFLAG_RW, &rs->rs_min_seg, 0, "What is the minimum we need to send on this interface?"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flow_limit", CTLFLAG_RW, &rs->rs_flow_limit, 0, "What is the limit for number of flows (0=unlimited)?"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "highest", CTLFLAG_RD, &rs->rs_highest_valid, 0, "Highest valid rate"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "lowest", CTLFLAG_RD, &rs->rs_lowest_valid, 0, "Lowest valid rate"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flags", CTLFLAG_RD, &rs->rs_flags, 0, "What lags are on the entry?"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "numrates", CTLFLAG_RD, &rs->rs_rate_cnt, 0, "How many rates re there?"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flows_using", CTLFLAG_RD, &rs->rs_flows_using, 0, "How many flows are using this interface now?"); #ifdef DETAILED_RATELIMIT_SYSCTL if (rs->rs_rlt && rs->rs_rate_cnt > 0) { /* Lets display the rates */ int i; struct sysctl_oid *rl_rates; struct sysctl_oid *rl_rate_num; char rate_num[16]; rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "rate", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ratelist"); for( i = 0; i < rs->rs_rate_cnt; i++) { sprintf(rate_num, "%d", i); rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rates), OID_AUTO, rate_num, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Individual Rate"); SYSCTL_ADD_U32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "flags", CTLFLAG_RD, &rs->rs_rlt[i].flags, 0, "Flags on this rate"); SYSCTL_ADD_U32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "pacetime", CTLFLAG_RD, &rs->rs_rlt[i].time_between, 0, "Time hardware inserts between 1500 byte sends"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "rate", CTLFLAG_RD, &rs->rs_rlt[i].rate, 0, "Rate in bytes per second"); } } #endif } static void rs_destroy(epoch_context_t ctx) { struct tcp_rate_set *rs; bool do_free_rs; rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); mtx_lock(&rs_mtx); rs->rs_flags &= ~RS_FUNERAL_SCHD; /* * In theory its possible (but unlikely) * that while the delete was occuring * and we were applying the DEAD flag * someone slipped in and found the * interface in a lookup. While we * decided rs_flows_using were 0 and * scheduling the epoch_call, the other * thread incremented rs_flow_using. This * is because users have a pointer and * we only use the rs_flows_using in an * atomic fashion, i.e. the other entities * are not protected. To assure this did * not occur, we check rs_flows_using here * before deleting. */ do_free_rs = (rs->rs_flows_using == 0); rs_number_dead--; mtx_unlock(&rs_mtx); if (do_free_rs) { sysctl_ctx_free(&rs->sysctl_ctx); free(rs->rs_rlt, M_TCPPACE); free(rs, M_TCPPACE); } } static void rs_defer_destroy(struct tcp_rate_set *rs) { mtx_assert(&rs_mtx, MA_OWNED); /* Check if already pending. */ if (rs->rs_flags & RS_FUNERAL_SCHD) return; rs_number_dead++; /* Set flag to only defer once. */ rs->rs_flags |= RS_FUNERAL_SCHD; NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); } #ifdef INET extern counter_u64_t rate_limit_set_ok; extern counter_u64_t rate_limit_active; extern counter_u64_t rate_limit_alloc_fail; #endif static int rl_attach_txrtlmt(struct ifnet *ifp, uint32_t flowtype, int flowid, uint64_t cfg_rate, struct m_snd_tag **tag) { int error; union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = cfg_rate, .rate_limit.flags = M_NOWAIT, }; - if (ifp->if_snd_tag_alloc == NULL) { - error = EOPNOTSUPP; - } else { - error = ifp->if_snd_tag_alloc(ifp, ¶ms, tag); + error = m_snd_tag_alloc(ifp, ¶ms, tag); #ifdef INET - if (error == 0) { - counter_u64_add(rate_limit_set_ok, 1); - counter_u64_add(rate_limit_active, 1); - } else - counter_u64_add(rate_limit_alloc_fail, 1); + if (error == 0) { + counter_u64_add(rate_limit_set_ok, 1); + counter_u64_add(rate_limit_active, 1); + } else if (error != EOPNOTSUPP) + counter_u64_add(rate_limit_alloc_fail, 1); #endif - } return (error); } static void populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) { /* * The internal table is "special", it * is two seperate ordered tables that * must be merged. We get here when the * adapter specifies a number of rates that * covers both ranges in the table in some * form. */ int i, at_low, at_high; uint8_t low_disabled = 0, high_disabled = 0; for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { rs->rs_rlt[i].flags = 0; rs->rs_rlt[i].time_between = 0; if ((low_disabled == 0) && (high_disabled || (rate_table_act[at_low] < rate_table_act[at_high]))) { rs->rs_rlt[i].rate = rate_table_act[at_low]; at_low++; if (at_low == RS_NEXT_ORDER_GROUP) low_disabled = 1; } else if (high_disabled == 0) { rs->rs_rlt[i].rate = rate_table_act[at_high]; at_high++; if (at_high == MAX_HDWR_RATES) high_disabled = 1; } } } static struct tcp_rate_set * rt_setup_new_rs(struct ifnet *ifp, int *error) { struct tcp_rate_set *rs; const uint64_t *rate_table_act; uint64_t lentim, res; size_t sz; uint32_t hash_type; int i; struct if_ratelimit_query_results rl; struct sysctl_oid *rl_sysctl_root; /* * We expect to enter with the * mutex locked. */ if (ifp->if_ratelimit_query == NULL) { /* * We can do nothing if we cannot * get a query back from the driver. */ printf("Warning:No query functions for %s:%d-- failed\n", ifp->if_dname, ifp->if_dunit); return (NULL); } rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); if (rs == NULL) { if (error) *error = ENOMEM; printf("Warning:No memory for malloc of tcp_rate_set\n"); return (NULL); } memset(&rl, 0, sizeof(rl)); rl.flags = RT_NOSUPPORT; ifp->if_ratelimit_query(ifp, &rl); if (rl.flags & RT_IS_UNUSABLE) { /* * The interface does not really support * the rate-limiting. */ memset(rs, 0, sizeof(struct tcp_rate_set)); rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_flags = RS_INTF_NO_SUP; rs->rs_disable = 1; rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { memset(rs, 0, sizeof(struct tcp_rate_set)); rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_flags = RS_IS_DEFF; rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { /* Mellanox C4 likely */ rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_rate_cnt = rl.number_of_rates; rs->rs_min_seg = rl.min_segment_burst; rs->rs_highest_valid = 0; rs->rs_flow_limit = rl.max_flows; rs->rs_flags = RS_IS_INTF | RS_NO_PRE; rs->rs_disable = 0; rate_table_act = rl.rate_table; } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { /* Chelsio, C5 and C6 of Mellanox? */ rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_rate_cnt = rl.number_of_rates; rs->rs_min_seg = rl.min_segment_burst; rs->rs_disable = 0; rs->rs_flow_limit = rl.max_flows; rate_table_act = desired_rates; if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { /* * Our desired table is not big * enough, do what we can. */ rs->rs_rate_cnt = MAX_HDWR_RATES; } if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) rs->rs_flags = RS_IS_INTF; else rs->rs_flags = RS_IS_INTF | RS_INT_TBL; if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) rs->rs_rate_cnt = ALL_HARDWARE_RATES; } else { free(rs, M_TCPPACE); return (NULL); } sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); if (rs->rs_rlt == NULL) { if (error) *error = ENOMEM; bail: free(rs, M_TCPPACE); return (NULL); } if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { /* * The interface supports all * the rates we could possibly want. */ uint64_t rat; rs->rs_rlt[0].rate = 12500; /* 100k */ rs->rs_rlt[1].rate = 25000; /* 200k */ rs->rs_rlt[2].rate = 62500; /* 500k */ /* Note 125000 == 1Megabit * populate 1Meg - 1000meg. */ for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { rs->rs_rlt[i].rate = rat; rat += 125000; } rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; } else if (rs->rs_flags & RS_INT_TBL) { /* We populate this in a special way */ populate_canned_table(rs, rate_table_act); } else { /* * Just copy in the rates from * the table, it is in order. */ for (i=0; irs_rate_cnt; i++) { rs->rs_rlt[i].rate = rate_table_act[i]; rs->rs_rlt[i].time_between = 0; rs->rs_rlt[i].flags = 0; } } for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { /* * We go backwards through the list so that if we can't get * a rate and fail to init one, we have at least a chance of * getting the highest one. */ rs->rs_rlt[i].ptbl = rs; rs->rs_rlt[i].tag = NULL; /* * Calculate the time between. */ lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; res = lentim / rs->rs_rlt[i].rate; if (res > 0) rs->rs_rlt[i].time_between = res; else rs->rs_rlt[i].time_between = 1; if (rs->rs_flags & RS_NO_PRE) { rs->rs_rlt[i].flags = HDWRPACE_INITED; rs->rs_lowest_valid = i; } else { int err; if ((rl.flags & RT_IS_SETUP_REQ) && (ifp->if_ratelimit_query)) { err = ifp->if_ratelimit_setup(ifp, rs->rs_rlt[i].rate, i); if (err) goto handle_err; } #ifdef RSS hash_type = M_HASHTYPE_RSS_TCP_IPV4; #else hash_type = M_HASHTYPE_OPAQUE_HASH; #endif err = rl_attach_txrtlmt(ifp, hash_type, (i + 1), rs->rs_rlt[i].rate, &rs->rs_rlt[i].tag); if (err) { handle_err: if (i == (rs->rs_rate_cnt - 1)) { /* * Huh - first rate and we can't get * it? */ free(rs->rs_rlt, M_TCPPACE); if (error) *error = err; goto bail; } else { if (error) *error = err; } break; } else { rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; rs->rs_lowest_valid = i; } } } /* Did we get at least 1 rate? */ if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) rs->rs_highest_valid = rs->rs_rate_cnt - 1; else { free(rs->rs_rlt, M_TCPPACE); goto bail; } rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } static const struct tcp_hwrate_limit_table * tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) { struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; uint64_t mbits_per_sec, ind_calc; int i; mbits_per_sec = (bytes_per_sec * 8); if (flags & RS_PACING_LT) { if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ /* * Smaller than 1Meg, only * 3 entries can match it. */ for(i = rs->rs_lowest_valid; i < 3; i++) { if (bytes_per_sec <= rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { arte = &rs->rs_rlt[i]; } } goto done; } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ /* * Larger than 1G (the majority of * our table. */ if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; else arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; goto done; } /* * If we reach here its in our table (between 1Meg - 1000Meg), * just take the rounded down mbits per second, and add * 1Megabit to it, from this we can calculate * the index in the table. */ ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) ind_calc++; /* our table is offset by 3, we add 2 */ ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if ((ind_calc >= rs->rs_lowest_valid) && (ind_calc <= rs->rs_highest_valid)) rte = &rs->rs_rlt[ind_calc]; } else if (flags & RS_PACING_EXACT_MATCH) { if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ for(i = rs->rs_lowest_valid; i < 3; i++) { if (bytes_per_sec == rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } } } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { /* > 1Gbps only one rate */ if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { /* Its 10G wow */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } } else { /* Ok it must be a exact meg (its between 1G and 1Meg) */ ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { /* its an exact Mbps */ ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) rte = &rs->rs_rlt[ind_calc]; } } } else { /* we want greater than the requested rate */ if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ for (i=2; i>=rs->rs_lowest_valid; i--) { if (bytes_per_sec < rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } else if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; break; } else { arte = &rs->rs_rlt[i]; /* new alternate */ } } } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ /* Our top rate is larger than the request */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } else if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { /* It matches our top rate */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { /* The top rate is an alternative */ arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } } else { /* Its in our range 1Meg - 1Gig */ if (flags & RS_PACING_GEQ) { ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = (ALL_HARDWARE_RATES-1); } rte = &rs->rs_rlt[ind_calc]; } goto done; } ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) rte = &rs->rs_rlt[ind_calc]; } } done: if ((rte == NULL) && (arte != NULL) && (flags & RS_PACING_SUB_OK)) { /* We can use the substitute */ rte = arte; } return (rte); } static const struct tcp_hwrate_limit_table * tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) { /** * Hunt the rate table with the restrictions in flags and find a * suitable rate if possible. * RS_PACING_EXACT_MATCH - look for an exact match to rate. * RS_PACING_GT - must be greater than. * RS_PACING_GEQ - must be greater than or equal. * RS_PACING_LT - must be less than. * RS_PACING_SUB_OK - If we don't meet criteria a * substitute is ok. */ int i, matched; struct tcp_hwrate_limit_table *rte = NULL; if ((rs->rs_flags & RS_INT_TBL) && (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { /* * Here we don't want to paw thru * a big table, we have everything * from 1Meg - 1000Meg in 1Meg increments. * Use an alternate method to "lookup". */ return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); } if ((flags & RS_PACING_LT) || (flags & RS_PACING_EXACT_MATCH)) { /* * For exact and less than we go forward through the table. * This way when we find one larger we stop (exact was a * toss up). */ for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { if ((flags & RS_PACING_EXACT_MATCH) && (bytes_per_sec == rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; matched = 1; break; } else if ((flags & RS_PACING_LT) && (bytes_per_sec <= rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; matched = 1; break; } if (bytes_per_sec > rs->rs_rlt[i].rate) break; } if ((matched == 0) && (flags & RS_PACING_LT) && (flags & RS_PACING_SUB_OK)) { /* Kick in a substitute (the lowest) */ rte = &rs->rs_rlt[rs->rs_lowest_valid]; } } else { /* * Here we go backward through the table so that we can find * the one greater in theory faster (but its probably a * wash). */ for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { if (rs->rs_rlt[i].rate > bytes_per_sec) { /* A possible candidate */ rte = &rs->rs_rlt[i]; } if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[i].rate)) { /* An exact match and we want equal */ matched = 1; rte = &rs->rs_rlt[i]; break; } else if (rte) { /* * Found one that is larger than but don't * stop, there may be a more closer match. */ matched = 1; } if (rs->rs_rlt[i].rate < bytes_per_sec) { /* * We found a table entry that is smaller, * stop there will be none greater or equal. */ break; } } if ((matched == 0) && (flags & RS_PACING_SUB_OK)) { /* Kick in a substitute (the highest) */ rte = &rs->rs_rlt[rs->rs_highest_valid]; } } return (rte); } static struct ifnet * rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) { struct ifnet *tifp; struct m_snd_tag *tag; union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = 1, .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = COMMON_RATE, .rate_limit.flags = M_NOWAIT, }; int err; #ifdef RSS params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); #else params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; #endif - tag = NULL; - if (ifp->if_snd_tag_alloc) { - if (error) - *error = ENODEV; - return (NULL); - } - err = ifp->if_snd_tag_alloc(ifp, ¶ms, &tag); + err = m_snd_tag_alloc(ifp, ¶ms, &tag); if (err) { /* Failed to setup a tag? */ if (error) *error = err; return (NULL); } tifp = tag->ifp; m_snd_tag_rele(tag); return (tifp); } static const struct tcp_hwrate_limit_table * rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, uint32_t flags, int *error) { /* First lets find the interface if it exists */ const struct tcp_hwrate_limit_table *rte; struct tcp_rate_set *rs; struct epoch_tracker et; int err; NET_EPOCH_ENTER(et); use_real_interface: CK_LIST_FOREACH(rs, &int_rs, next) { /* * Note we don't look with the lock since we either see a * new entry or will get one when we try to add it. */ if (rs->rs_flags & RS_IS_DEAD) { /* The dead are not looked at */ continue; } if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { /* Ok we found it */ break; } } if ((rs == NULL) || (rs->rs_flags & RS_INTF_NO_SUP) || (rs->rs_flags & RS_IS_DEAD)) { /* * This means we got a packet *before* * the IF-UP was processed below, * while or after we already received an interface * departed event. In either case we really don't * want to do anything with pacing, in * the departing case the packet is not * going to go very far. The new case * might be arguable, but its impossible * to tell from the departing case. */ if (rs->rs_disable && error) *error = ENODEV; NET_EPOCH_EXIT(et); return (NULL); } if ((rs == NULL) || (rs->rs_disable != 0)) { if (rs->rs_disable && error) *error = ENOSPC; NET_EPOCH_EXIT(et); return (NULL); } if (rs->rs_flags & RS_IS_DEFF) { /* We need to find the real interface */ struct ifnet *tifp; tifp = rt_find_real_interface(ifp, inp, error); if (tifp == NULL) { if (rs->rs_disable && error) *error = ENOTSUP; NET_EPOCH_EXIT(et); return (NULL); } goto use_real_interface; } if (rs->rs_flow_limit && ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { if (error) *error = ENOSPC; NET_EPOCH_EXIT(et); return (NULL); } rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); if (rte) { err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, inp->inp_flowtype, inp->inp_flowid, rte->rate, &inp->inp_snd_tag); if (err) { /* Failed to attach */ if (error) *error = err; rte = NULL; } } if (rte) { /* * We use an atomic here for accounting so we don't have to * use locks when freeing. */ atomic_add_64(&rs->rs_flows_using, 1); } NET_EPOCH_EXIT(et); return (rte); } static void tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) { int error; struct tcp_rate_set *rs; if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || (link_state != LINK_STATE_UP)) { /* * We only care on an interface going up that is rate-limit * capable. */ return; } mtx_lock(&rs_mtx); CK_LIST_FOREACH(rs, &int_rs, next) { if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { /* We already have initialized this guy */ mtx_unlock(&rs_mtx); return; } } mtx_unlock(&rs_mtx); rt_setup_new_rs(ifp, &error); } static void tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) { struct tcp_rate_set *rs, *nrs; int i; mtx_lock(&rs_mtx); CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { CK_LIST_REMOVE(rs, next); rs_number_alive--; rs->rs_flags |= RS_IS_DEAD; for (i = 0; i < rs->rs_rate_cnt; i++) { if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { in_pcbdetach_tag(rs->rs_rlt[i].tag); rs->rs_rlt[i].tag = NULL; } rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; } if (rs->rs_flows_using == 0) rs_defer_destroy(rs); break; } } mtx_unlock(&rs_mtx); } static void tcp_rl_shutdown(void *arg __unused, int howto __unused) { struct tcp_rate_set *rs, *nrs; int i; mtx_lock(&rs_mtx); CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { CK_LIST_REMOVE(rs, next); rs_number_alive--; rs->rs_flags |= RS_IS_DEAD; for (i = 0; i < rs->rs_rate_cnt; i++) { if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { in_pcbdetach_tag(rs->rs_rlt[i].tag); rs->rs_rlt[i].tag = NULL; } rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; } if (rs->rs_flows_using == 0) rs_defer_destroy(rs); } mtx_unlock(&rs_mtx); } const struct tcp_hwrate_limit_table * tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, uint64_t bytes_per_sec, int flags, int *error) { const struct tcp_hwrate_limit_table *rte; #ifdef KERN_TLS struct ktls_session *tls; #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_inpcb->inp_snd_tag == NULL) { /* * We are setting up a rate for the first time. */ if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) { /* Not supported by the egress */ if (error) *error = ENODEV; return (NULL); } #ifdef KERN_TLS tls = NULL; if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info; if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 || tls->mode != TCP_TLS_MODE_IFNET) { if (error) *error = ENODEV; return (NULL); } } #endif rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); #ifdef KERN_TLS if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { /* * Fake a route change error to reset the TLS * send tag. This will convert the existing * tag to a TLS ratelimit tag. */ MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS); ktls_output_eagain(tp->t_inpcb, tls); } #endif } else { /* * We are modifying a rate, wrong interface? */ if (error) *error = EINVAL; rte = NULL; } tp->t_pacing_rate = rte->rate; *error = 0; return (rte); } const struct tcp_hwrate_limit_table * tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp, struct ifnet *ifp, uint64_t bytes_per_sec, int flags, int *error) { const struct tcp_hwrate_limit_table *nrte; const struct tcp_rate_set *rs; #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif int is_indirect = 0; int err; INP_WLOCK_ASSERT(tp->t_inpcb); if (crte == NULL) { /* Wrong interface */ if (error) *error = EINVAL; return (NULL); } #ifdef KERN_TLS if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info; MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag != NULL && tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) { /* * NIC probably doesn't support ratelimit TLS * tags if it didn't allocate one when an * existing rate was present, so ignore. */ if (error) *error = EOPNOTSUPP; return (NULL); } } #endif if (tp->t_inpcb->inp_snd_tag == NULL) { /* Wrong interface */ if (error) *error = EINVAL; return (NULL); } rs = crte->ptbl; if ((rs->rs_flags & RS_IS_DEAD) || (crte->flags & HDWRPACE_IFPDEPARTED)) { /* Release the rate, and try anew */ re_rate: tcp_rel_pacing_rate(crte, tp); nrte = tcp_set_pacing_rate(tp, ifp, bytes_per_sec, flags, error); return (nrte); } if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) is_indirect = 1; else is_indirect = 0; if ((is_indirect == 0) && ((ifp != rs->rs_ifp) || (ifp->if_dunit != rs->rs_if_dunit))) { /* * Something changed, the user is not pointing to the same * ifp? Maybe a route updated on this guy? */ goto re_rate; } else if (is_indirect) { /* * For indirect we have to dig in and find the real interface. */ struct ifnet *rifp; rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); if (rifp == NULL) { /* Can't find it? */ goto re_rate; } if ((rifp != rs->rs_ifp) || (ifp->if_dunit != rs->rs_if_dunit)) { goto re_rate; } } nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); if (nrte == crte) { /* No change */ if (error) *error = 0; return (crte); } if (nrte == NULL) { /* Release the old rate */ tcp_rel_pacing_rate(crte, tp); return (NULL); } /* Change rates to our new entry */ #ifdef KERN_TLS if (tls != NULL) err = ktls_modify_txrtlmt(tls, nrte->rate); else #endif err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); if (err) { if (error) *error = err; return (NULL); } if (error) *error = 0; tp->t_pacing_rate = nrte->rate; return (nrte); } void tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) { const struct tcp_rate_set *crs; struct tcp_rate_set *rs; uint64_t pre; INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_pacing_rate = -1; crs = crte->ptbl; /* * Now we must break the const * in order to release our refcount. */ rs = __DECONST(struct tcp_rate_set *, crs); pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); if (pre == 1) { mtx_lock(&rs_mtx); /* * Is it dead? */ if (rs->rs_flags & RS_IS_DEAD) rs_defer_destroy(rs); mtx_unlock(&rs_mtx); } /* * XXX: If this connection is using ifnet TLS, should we * switch it to using an unlimited rate, or perhaps use * ktls_output_eagain() to reset the send tag to a plain * TLS tag? */ in_pcbdetach_txrtlmt(tp->t_inpcb); } #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ uint32_t tcp_get_pacing_burst_size (uint64_t bw, uint32_t segsiz, int can_use_1mss, const struct tcp_hwrate_limit_table *te, int *err) { /* * We use the google formula to calculate the * TSO size. I.E. * bw < 24Meg * tso = 2mss * else * tso = min(bw/1000, 64k) * * Note for these calculations we ignore the * packet overhead (enet hdr, ip hdr and tcp hdr). */ uint64_t lentim, res, bytes; uint32_t new_tso, min_tso_segs; bytes = bw / 1000; if (bytes > (64 * 1000)) bytes = 64 * 1000; /* Round up */ new_tso = (bytes + segsiz - 1) / segsiz; if (can_use_1mss && (bw < ONE_POINT_TWO_MEG)) min_tso_segs = 1; else min_tso_segs = 2; if (new_tso < min_tso_segs) new_tso = min_tso_segs; if (new_tso > MAX_MSS_SENT) new_tso = MAX_MSS_SENT; new_tso *= segsiz; /* * If we are not doing hardware pacing * then we are done. */ if (te == NULL) { if (err) *err = 0; return(new_tso); } /* * For hardware pacing we look at the * rate you are sending at and compare * that to the rate you have in hardware. * * If the hardware rate is slower than your * software rate then you are in error and * we will build a queue in our hardware whic * is probably not desired, in such a case * just return the non-hardware TSO size. * * If the rate in hardware is faster (which * it should be) then look at how long it * takes to send one ethernet segment size at * your b/w and compare that to the time it * takes to send at the rate you had selected. * * If your time is greater (which we hope it is) * we get the delta between the two, and then * divide that into your pacing time. This tells * us how many MSS you can send down at once (rounded up). * * Note we also double this value if the b/w is over * 100Mbps. If its over 500meg we just set you to the * max (43 segments). */ if (te->rate > FIVE_HUNDRED_MBPS) return (segsiz * MAX_MSS_SENT); if (te->rate == bw) { /* We are pacing at exactly the hdwr rate */ return (segsiz * MAX_MSS_SENT); } lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; res = lentim / bw; if (res > te->time_between) { uint32_t delta, segs; delta = res - te->time_between; segs = (res + delta - 1)/delta; if (te->rate > ONE_HUNDRED_MBPS) segs *= 2; if (segs < min_tso_segs) segs = min_tso_segs; if (segs > MAX_MSS_SENT) segs = MAX_MSS_SENT; segs *= segsiz; if (err) *err = 0; if (segs < new_tso) { /* unexpected ? */ return(new_tso); } else { return (segs); } } else { /* * Your time is smaller which means * we will grow a queue on our * hardware. Send back the non-hardware * rate. */ if (err) *err = -1; return (new_tso); } } static eventhandler_tag rl_ifnet_departs; static eventhandler_tag rl_ifnet_arrives; static eventhandler_tag rl_shutdown_start; static void tcp_rs_init(void *st __unused) { CK_LIST_INIT(&int_rs); rs_number_alive = 0; rs_number_dead = 0; mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, tcp_rl_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY); rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, tcp_rl_ifnet_link, NULL, EVENTHANDLER_PRI_ANY); rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_rl_shutdown, NULL, SHUTDOWN_PRI_FIRST); printf("TCP_ratelimit: Is now initialized\n"); } SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); #endif diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 6b5a6141dd27..e7d958da2037 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -1,1620 +1,1623 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #include #ifdef WITNESS #include #endif #endif #ifdef _KERNEL #include #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__getjcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #define M_NODOM 255 #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Static network interface owned tag. * Allocated through ifp->if_snd_tag_alloc(). */ struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ volatile u_int refcount; u_int type; /* One of IF_SND_TAG_TYPE_*. */ }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { union { struct m_snd_tag *snd_tag; /* send tag, if any */ struct ifnet *rcvif; /* rcv interface */ }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_t rcv_tstmp; /* timestamp in ns */ struct { uint8_t l2hlen; /* layer 2 hdr len */ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ uint8_t inner_l2hlen; uint8_t inner_l3hlen; uint8_t inner_l4hlen; uint8_t inner_l5hlen; }; }; union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] /* mld and v6-ND */ #define tso_segsz PH_per.sixteen[1] /* inbound after LRO */ #define lro_nsegs tso_segsz /* inbound after LRO */ #define csum_data PH_per.thirtytwo[1] /* inbound from hardware up */ #define lro_len PH_loc.sixteen[0] /* inbound during LRO (no reassembly) */ #define lro_csum PH_loc.sixteen[1] /* inbound during LRO (no reassembly) */ /* Note PH_loc is used during IP reassembly (all 8 bytes as a ptr) */ /* * TLS records for TLS 1.0-1.2 can have the following header lengths: * - 5 (AES-CBC with implicit IV) * - 21 (AES-CBC with explicit IV) * - 13 (AES-GCM with 8 byte explicit IV) */ #define MBUF_PEXT_HDR_LEN 23 /* * TLS records for TLS 1.0-1.2 can have the following maximum trailer * lengths: * - 16 (AES-GCM) * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding) * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding) * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding) */ #define MBUF_PEXT_TRAIL_LEN 64 #if defined(__LP64__) #define MBUF_PEXT_MAX_PGS (40 / sizeof(vm_paddr_t)) #else #define MBUF_PEXT_MAX_PGS (72 / sizeof(vm_paddr_t)) #endif #define MBUF_PEXT_MAX_BYTES \ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) struct ktls_session; struct socket; /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ typedef void m_ext_free_t(struct mbuf *); struct m_ext { union { /* * If EXT_FLAG_EMBREF is set, then we use refcount in the * mbuf, the 'ext_count' member. Otherwise, we have a * shadow copy and we use pointer 'ext_cnt'. The original * mbuf is responsible to carry the pointer to free routine * and its arguments. They aren't copied into shadows in * mb_dupcl() to avoid dereferencing next cachelines. */ volatile u_int ext_count; volatile u_int *ext_cnt; }; uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ union { struct { /* * Regular M_EXT mbuf: * o ext_buf always points to the external buffer. * o ext_free (below) and two optional arguments * ext_arg1 and ext_arg2 store the free context for * the external storage. They are set only in the * refcount carrying mbuf, the one with * EXT_FLAG_EMBREF flag, with exclusion for * EXT_EXTREF type, where the free context is copied * into all mbufs that use same external storage. */ char *ext_buf; /* start of buffer */ #define m_ext_copylen offsetof(struct m_ext, ext_arg2) void *ext_arg2; }; struct { /* * Multi-page M_EXTPG mbuf: * o extpg_pa - page vector. * o extpg_trail and extpg_hdr - TLS trailer and * header. * Uses ext_free and may also use ext_arg1. */ vm_paddr_t extpg_pa[MBUF_PEXT_MAX_PGS]; char extpg_trail[MBUF_PEXT_TRAIL_LEN]; char extpg_hdr[MBUF_PEXT_HDR_LEN]; /* Pretend these 3 fields are part of mbuf itself. */ #define m_epg_pa m_ext.extpg_pa #define m_epg_trail m_ext.extpg_trail #define m_epg_hdr m_ext.extpg_hdr #define m_epg_ext_copylen offsetof(struct m_ext, ext_free) }; }; /* * Free method and optional argument pointer, both * used by M_EXT and M_EXTPG. */ m_ext_free_t *ext_free; void *ext_arg1; }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. */ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { union { /* M_PKTHDR set. */ struct pkthdr m_pkthdr; /* M_EXTPG set. * Multi-page M_EXTPG mbuf has its meta data * split between the below anonymous structure * and m_ext. It carries vector of pages, * optional header and trailer char vectors * and pointers to socket/TLS data. */ #define m_epg_startcopy m_epg_npgs #define m_epg_endcopy m_epg_stailq struct { /* Overall count of pages and count of * pages with I/O pending. */ uint8_t m_epg_npgs; uint8_t m_epg_nrdy; /* TLS header and trailer lengths. * The data itself resides in m_ext. */ uint8_t m_epg_hdrlen; uint8_t m_epg_trllen; /* Offset into 1st page and length of * data in the last page. */ uint16_t m_epg_1st_off; uint16_t m_epg_last_len; uint8_t m_epg_flags; #define EPG_FLAG_ANON 0x1 /* Data can be encrypted in place. */ #define EPG_FLAG_2FREE 0x2 /* Scheduled for free. */ uint8_t m_epg_record_type; uint8_t __spare[2]; int m_epg_enc_cnt; struct ktls_session *m_epg_tls; struct socket *m_epg_so; uint64_t m_epg_seqno; STAILQ_ENTRY(mbuf) m_epg_stailq; }; }; union { /* M_EXT or M_EXTPG set. */ struct m_ext m_ext; /* M_PKTHDR set, neither M_EXT nor M_EXTPG. */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; #ifdef _KERNEL static inline int m_epg_pagelen(const struct mbuf *m, int pidx, int pgoff) { KASSERT(pgoff == 0 || pidx == 0, ("page %d with non-zero offset %d in %p", pidx, pgoff, m)); if (pidx == m->m_epg_npgs - 1) { return (m->m_epg_last_len); } else { return (PAGE_SIZE - pgoff); } } #ifdef INVARIANTS #define MCHECK(ex, msg) KASSERT((ex), \ ("Multi page mbuf %p with " #msg " at %s:%d", \ m, __FILE__, __LINE__)) /* * NB: This expects a non-empty buffer (npgs > 0 and * last_pg_len > 0). */ #define MBUF_EXT_PGS_ASSERT_SANITY(m) do { \ MCHECK(m->m_epg_npgs > 0, "no valid pages"); \ MCHECK(m->m_epg_npgs <= nitems(m->m_epg_pa), \ "too many pages"); \ MCHECK(m->m_epg_nrdy <= m->m_epg_npgs, \ "too many ready pages"); \ MCHECK(m->m_epg_1st_off < PAGE_SIZE, \ "too large page offset"); \ MCHECK(m->m_epg_last_len > 0, "zero last page length"); \ MCHECK(m->m_epg_last_len <= PAGE_SIZE, \ "too large last page length"); \ if (m->m_epg_npgs == 1) \ MCHECK(m->m_epg_1st_off + \ m->m_epg_last_len <= PAGE_SIZE, \ "single page too large"); \ MCHECK(m->m_epg_hdrlen <= sizeof(m->m_epg_hdr), \ "too large header length"); \ MCHECK(m->m_epg_trllen <= sizeof(m->m_epg_trail), \ "too large header length"); \ } while (0) #else #define MBUF_EXT_PGS_ASSERT_SANITY(m) do {} while (0); #endif #endif /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-11] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_EXTPG 0x00000100 /* has array of unmapped pages and TLS */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ #define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00002000 /* protocol-specific */ #define M_PROTO2 0x00004000 /* protocol-specific */ #define M_PROTO3 0x00008000 /* protocol-specific */ #define M_PROTO4 0x00010000 /* protocol-specific */ #define M_PROTO5 0x00020000 /* protocol-specific */ #define M_PROTO6 0x00040000 /* protocol-specific */ #define M_PROTO7 0x00080000 /* protocol-specific */ #define M_PROTO8 0x00100000 /* protocol-specific */ #define M_PROTO9 0x00200000 /* protocol-specific */ #define M_PROTO10 0x00400000 /* protocol-specific */ #define M_PROTO11 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG\11M_EXTPG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO" #define M_FLAG_PROTOBITS \ "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \ "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \ "\27M_PROTO10\28M_PROTO11" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. * * The meaning of the IPV6_EX suffix: * "o Home address from the home address option in the IPv6 destination * options header. If the extension header is not present, use the Source * IPv6 Address. * o IPv6 address that is contained in the Routing-Header-Type-2 from the * associated extension header. If the extension header is not present, * use the Destination IPv6 Address." * Quoted from: * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_RXRING 8 /* data in NIC receive ring */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. */ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* These flags are vendor */ #define EXT_FLAG_VENDOR2 0x020000 /* or submodule specific, */ #define EXT_FLAG_VENDOR3 0x040000 /* not used by mbuf code. */ #define EXT_FLAG_VENDOR4 0x080000 /* Set/read by submodule. */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. Note that the outbound and the inbound flags do * not collide right now but they could be allowed to (as long as the flags are * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX. * * CSUM_INNER_ is the same as CSUM_ but it applies to the inner frame. * The CSUM_ENCAP_ bits identify the outer encapsulation. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_INNER_IP6_UDP 0x00000040 #define CSUM_INNER_IP6_TCP 0x00000080 #define CSUM_INNER_IP6_TSO 0x00000100 #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ #define CSUM_INNER_IP 0x00004000 #define CSUM_INNER_IP_UDP 0x00008000 #define CSUM_INNER_IP_TCP 0x00010000 #define CSUM_INNER_IP_TSO 0x00020000 #define CSUM_ENCAP_VXLAN 0x00040000 /* VXLAN outer encapsulation */ #define CSUM_ENCAP_RSVD1 0x00080000 /* Inbound checksum support where the checksum was verified by hardware. */ #define CSUM_INNER_L3_CALC 0x00100000 #define CSUM_INNER_L3_VALID 0x00200000 #define CSUM_INNER_L4_CALC 0x00400000 #define CSUM_INNER_L4_VALID 0x00800000 #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ #define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ #define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \ CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \ CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \ CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \ CSUM_ENCAP_RSVD1 | CSUM_SND_TAG) #define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \ CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \ CSUM_COALESCED) /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \ "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \ "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \ "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \ "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \ "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \ "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \ "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_INNER_TSO (CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_EXTCONTROL 15 /* control message with externalized contents */ #define MT_OOBDATA 16 /* expedited data */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs" #ifdef _KERNEL +union if_snd_tag_alloc_params; #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_extpgs; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void mb_free_extpg(struct mbuf *); void mb_free_mext_pgs(struct mbuf *); struct mbuf *mb_alloc_ext_pgs(int, m_ext_free_t); struct mbuf *mb_alloc_ext_plus_pages(int, int); struct mbuf *mb_mapped_to_unmapped(struct mbuf *, int, int, int, struct mbuf **); int mb_unmapped_compress(struct mbuf *m); struct mbuf *mb_unmapped_to_ext(struct mbuf *m); void mb_free_notready(struct mbuf *m, int count); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); void m_dispose_extcontrolm(struct mbuf *m); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, char *, u_int, m_ext_free_t, void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); int m_unmappedtouio(const struct mbuf *, int, struct uio *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); +int m_snd_tag_alloc(struct ifnet *, + union if_snd_tag_alloc_params *, struct m_snd_tag **); void m_snd_tag_init(struct m_snd_tag *, struct ifnet *, u_int); void m_snd_tag_destroy(struct m_snd_tag *); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. */ static __inline void m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt, m_ext_free_t freef, void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. */ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_EXTPG)) == 0 && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* Check if mbuf is multipage. */ #define M_ASSERTEXTPG(m) \ KASSERT(((m)->m_flags & (M_EXTPG|M_PKTHDR)) == M_EXTPG, \ ("%s: m %p is not multipage!", __func__, m)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXTPG) ? NULL : \ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Return the rcvif of a packet header. */ static __inline struct ifnet * m_rcvif(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) return (NULL); return (m->m_pkthdr.rcvif); } /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc. To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static inline struct m_snd_tag * m_snd_tag_ref(struct m_snd_tag *mst) { refcount_acquire(&mst->refcount); return (mst); } static inline void m_snd_tag_rele(struct m_snd_tag *mst) { if (refcount_release(&mst->refcount)) m_snd_tag_destroy(mst); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG) m_snd_tag_rele(m->m_pkthdr.snd_tag); if (m->m_flags & M_EXTPG) mb_free_extpg(m); else if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_xxx_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t); uint32_t m_infiniband_tcpip_hash_init(void); uint32_t m_infiniband_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_maxlen > 0 && mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } /* * Note: this doesn't enforce the maximum list size for dst. */ static inline void mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src) { mq_dst->mq_len += mq_src->mq_len; STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head); mq_src->mq_len = 0; } #ifdef _SYS_TIMESPEC_H_ static inline void mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m)); KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0, ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } #endif #ifdef DEBUGNET /* Invoked from the debugnet client code. */ void debugnet_mbuf_drain(void); void debugnet_mbuf_start(void); void debugnet_mbuf_finish(void); void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif static inline bool mbuf_has_tls_session(struct mbuf *m) { if (m->m_flags & M_EXTPG) { if (m->m_epg_tls != NULL) { return (true); } } return (false); } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */