Index: head/sys/dev/cxgbe/cxgbei/cxgbei.c =================================================================== --- head/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 335923) +++ head/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 335924) @@ -1,823 +1,824 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Chelsio T5xx iSCSI driver * * Written by: Sreenivasa Honnur * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ #include "tom/t4_tom.h" #include "cxgbei.h" static int worker_thread_count; static struct cxgbei_worker_thread_softc *cwt_softc; static struct proc *cxgbei_proc; /* XXXNP some header instead. */ struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *); static void free_ci_counters(struct cxgbei_data *ci) { #define FREE_CI_COUNTER(x) do { \ if (ci->x != NULL) { \ counter_u64_free(ci->x); \ ci->x = NULL; \ } \ } while (0) FREE_CI_COUNTER(ddp_setup_ok); FREE_CI_COUNTER(ddp_setup_error); FREE_CI_COUNTER(ddp_bytes); FREE_CI_COUNTER(ddp_pdus); FREE_CI_COUNTER(fl_bytes); FREE_CI_COUNTER(fl_pdus); #undef FREE_CI_COUNTER } static int alloc_ci_counters(struct cxgbei_data *ci) { #define ALLOC_CI_COUNTER(x) do { \ ci->x = counter_u64_alloc(M_WAITOK); \ if (ci->x == NULL) \ goto fail; \ } while (0) ALLOC_CI_COUNTER(ddp_setup_ok); ALLOC_CI_COUNTER(ddp_setup_error); ALLOC_CI_COUNTER(ddp_bytes); ALLOC_CI_COUNTER(ddp_pdus); ALLOC_CI_COUNTER(fl_bytes); ALLOC_CI_COUNTER(fl_pdus); #undef ALLOC_CI_COUNTER return (0); fail: free_ci_counters(ci); return (ENOMEM); } static void read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len, uint32_t *max_rx_pdu_len) { uint32_t tx_len, rx_len, r, v; rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); tx_len = t4_read_reg(sc, 
A_TP_PMM_TX_PAGE_SIZE); r = t4_read_reg(sc, A_TP_PARA_REG2); rx_len = min(rx_len, G_MAXRXDATA(r)); tx_len = min(tx_len, G_MAXRXDATA(r)); r = t4_read_reg(sc, A_TP_PARA_REG7); v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); rx_len = min(rx_len, v); tx_len = min(tx_len, v); /* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */ tx_len = min(tx_len, 3 * 4096); *max_tx_pdu_len = rounddown2(tx_len, 512); *max_rx_pdu_len = rounddown2(rx_len, 512); } /* * Initialize the software state of the iSCSI ULP driver. * * ENXIO means firmware didn't set up something that it was supposed to. */ static int cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) { struct sysctl_oid *oid; struct sysctl_oid_list *children; struct ppod_region *pr; uint32_t r; int rc; MPASS(sc->vres.iscsi.size > 0); MPASS(ci != NULL); rc = alloc_ci_counters(ci); if (rc != 0) return (rc); read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len); pr = &ci->pr; r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ); rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods"); if (rc != 0) { device_printf(sc->dev, "%s: failed to initialize the iSCSI page pod region: %u.\n", __func__, rc); free_ci_counters(ci); return (rc); } r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK); r &= V_ISCSITAGMASK(M_ISCSITAGMASK); if (r != pr->pr_tag_mask) { /* * Recent firmwares are supposed to set up the iSCSI tagmask * but we'll do it ourselves it the computed value doesn't match * what's in the register. 
*/ device_printf(sc->dev, "tagmask 0x%08x does not match computed mask 0x%08x.\n", r, pr->pr_tag_mask); t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK, V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask); } sysctl_ctx_init(&ci->ctx); oid = device_get_sysctl_tree(sc->dev); /* dev.t5nex.X */ children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD, NULL, "iSCSI ULP statistics"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok", CTLFLAG_RD, &ci->ddp_setup_ok, "# of times DDP buffer was setup successfully."); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error", CTLFLAG_RD, &ci->ddp_setup_error, "# of times DDP buffer setup failed."); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes", CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly"); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus", CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly."); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes", CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist"); SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus", CTLFLAG_RD, &ci->fl_pdus, "# of PDUs with data delivered in freelist"); ci->ddp_threshold = 2048; SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold", CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold"); return (0); } static int do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct icl_pdu *ip; struct icl_cxgbei_pdu *icp; uint16_t len_ddp = be16toh(cpl->pdu_len_ddp); uint16_t len = be16toh(cpl->len); M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); 
m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len; icp = ip_to_icp(ip); icp->icp_seq = ntohl(cpl->seq); icp->icp_flags = ICPF_RX_HDR; /* This is the start of a new PDU. There should be no old state. */ MPASS(toep->ulpcb2 == NULL); toep->ulpcb2 = icp; #if 0 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p", __func__, tid, len, len_ddp, icp); #endif m_freem(m); return (0); } static int do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct icl_cxgbei_pdu *icp = toep->ulpcb2; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl)); /* Must already have received the header (but not the data). */ MPASS(icp != NULL); MPASS(icp->icp_flags == ICPF_RX_HDR); MPASS(icp->ip.ip_data_mbuf == NULL); m_adj(m, sizeof(*cpl)); MPASS(icp->ip.ip_data_len == m->m_pkthdr.len); icp->icp_flags |= ICPF_RX_FLBUF; icp->ip.ip_data_mbuf = m; counter_u64_add(ci->fl_pdus, 1); counter_u64_add(ci->fl_bytes, m->m_pkthdr.len); #if 0 CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid, be16toh(cpl->len)); #endif return (0); } static int do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cxgbei_data *ci = sc->iscsi_ulp_softc; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct socket *so; struct sockbuf *sb; struct tcpcb *tp; struct icl_cxgbei_conn *icc; struct icl_conn *ic; struct icl_cxgbei_pdu *icp = toep->ulpcb2; struct icl_pdu *ip; u_int pdu_len, val; + struct epoch_tracker et; MPASS(m == NULL); /* Must already be assembling a PDU. 
*/ MPASS(icp != NULL); MPASS(icp->icp_flags & ICPF_RX_HDR); /* Data is optional. */ MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); pdu_len = be16toh(cpl->len); /* includes everything. */ val = be32toh(cpl->ddpvld); #if 0 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x", __func__, tid, pdu_len, val, icp->icp_flags); #endif icp->icp_flags |= ICPF_RX_STATUS; ip = &icp->ip; if (val & F_DDP_PADDING_ERR) icp->icp_flags |= ICPF_PAD_ERR; if (val & F_DDP_HDRCRC_ERR) icp->icp_flags |= ICPF_HCRC_ERR; if (val & F_DDP_DATACRC_ERR) icp->icp_flags |= ICPF_DCRC_ERR; if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); MPASS(ip->ip_data_len > 0); icp->icp_flags |= ICPF_RX_DDP; counter_u64_add(ci->ddp_pdus, 1); counter_u64_add(ci->ddp_bytes, ip->ip_data_len); } INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, pdu_len, inp->inp_flags); INP_WUNLOCK(inp); icl_cxgbei_conn_pdu_free(NULL, ip); #ifdef INVARIANTS toep->ulpcb2 = NULL; #endif return (0); } tp = intotcpcb(inp); MPASS(icp->icp_seq == tp->rcv_nxt); MPASS(tp->rcv_wnd >= pdu_len); tp->rcv_nxt += pdu_len; tp->rcv_wnd -= pdu_len; tp->t_rcvtime = ticks; /* update rx credits */ toep->rx_credits += pdu_len; t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */ so = inp->inp_socket; sb = &so->so_rcv; SOCKBUF_LOCK(sb); icc = toep->ulpcb; if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) { CTR5(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", __func__, tid, pdu_len, icc, sb->sb_state); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); icl_cxgbei_conn_pdu_free(NULL, ip); #ifdef INVARIANTS toep->ulpcb2 = NULL; #endif return 
(0); } MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ic = &icc->ic; icl_cxgbei_new_pdu_set_conn(ip, ic); MPASS(m == NULL); /* was unused, we'll use it now. */ m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */ if (__predict_false(m != NULL)) { int len = m_length(m, NULL); /* * PDUs were received before the tid transitioned to ULP mode. * Convert them to icl_cxgbei_pdus and send them to ICL before * the PDU in icp/ip. */ CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid, len); /* XXXNP: needs to be rewritten. */ if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct iscsi_bhs)) { struct icl_cxgbei_pdu *icp0; struct icl_pdu *ip0; ip0 = icl_cxgbei_new_pdu(M_NOWAIT); icl_cxgbei_new_pdu_set_conn(ip0, ic); if (ip0 == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); icp0 = ip_to_icp(ip0); icp0->icp_seq = 0; /* XXX */ icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS; m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs); STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next); } m_freem(m); } STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); if ((icc->rx_flags & RXF_ACTIVE) == 0) { struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; mtx_lock(&cwt->cwt_lock); icc->rx_flags |= RXF_ACTIVE; TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); if (cwt->cwt_state == CWT_SLEEPING) { cwt->cwt_state = CWT_RUNNING; cv_signal(&cwt->cwt_cv); } mtx_unlock(&cwt->cwt_lock); } SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); #ifdef INVARIANTS toep->ulpcb2 = NULL; #endif return (0); } static int cxgbei_activate(struct adapter *sc) { struct cxgbei_data *ci; int rc; ASSERT_SYNCHRONIZED_OP(sc); if (uld_active(sc, ULD_ISCSI)) { KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p", __func__, sc)); return (0); } if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) { device_printf(sc->dev, "not iSCSI offload capable, or capability disabled.\n"); return (ENOSYS); } /* per-adapter softc for iSCSI */ ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | 
M_WAITOK); if (ci == NULL) return (ENOMEM); rc = cxgbei_init(sc, ci); if (rc != 0) { free(ci, M_CXGBE); return (rc); } sc->iscsi_ulp_softc = ci; return (0); } static int cxgbei_deactivate(struct adapter *sc) { struct cxgbei_data *ci = sc->iscsi_ulp_softc; ASSERT_SYNCHRONIZED_OP(sc); if (ci != NULL) { sysctl_ctx_free(&ci->ctx); t4_free_ppod_region(&ci->pr); free_ci_counters(ci); free(ci, M_CXGBE); sc->iscsi_ulp_softc = NULL; } return (0); } static void cxgbei_activate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0) return; /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */ if (sc->offload_map && !uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static void cxgbei_deactivate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0) return; if (uld_active(sc, ULD_ISCSI)) (void) t4_deactivate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static struct uld_info cxgbei_uld_info = { .uld_id = ULD_ISCSI, .activate = cxgbei_activate, .deactivate = cxgbei_deactivate, }; static void cwt_main(void *arg) { struct cxgbei_worker_thread_softc *cwt = arg; struct icl_cxgbei_conn *icc = NULL; struct icl_conn *ic; struct icl_pdu *ip; struct sockbuf *sb; STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus); MPASS(cwt != NULL); mtx_lock(&cwt->cwt_lock); MPASS(cwt->cwt_state == 0); cwt->cwt_state = CWT_RUNNING; cv_signal(&cwt->cwt_cv); while (__predict_true(cwt->cwt_state != CWT_STOP)) { cwt->cwt_state = CWT_RUNNING; while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) { TAILQ_REMOVE(&cwt->rx_head, icc, rx_link); mtx_unlock(&cwt->cwt_lock); ic = &icc->ic; sb = &ic->ic_socket->so_rcv; SOCKBUF_LOCK(sb); MPASS(icc->rx_flags & RXF_ACTIVE); if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) { MPASS(STAILQ_EMPTY(&rx_pdus)); STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu); 
SOCKBUF_UNLOCK(sb); /* Hand over PDUs to ICL. */ while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) { STAILQ_REMOVE_HEAD(&rx_pdus, ip_next); ic->ic_receive(ip); } SOCKBUF_LOCK(sb); MPASS(STAILQ_EMPTY(&rx_pdus)); } MPASS(icc->rx_flags & RXF_ACTIVE); if (STAILQ_EMPTY(&icc->rcvd_pdus) || __predict_false(sb->sb_state & SBS_CANTRCVMORE)) { icc->rx_flags &= ~RXF_ACTIVE; } else { /* * More PDUs were received while we were busy * handing over the previous batch to ICL. * Re-add this connection to the end of the * queue. */ mtx_lock(&cwt->cwt_lock); TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); mtx_unlock(&cwt->cwt_lock); } SOCKBUF_UNLOCK(sb); mtx_lock(&cwt->cwt_lock); } /* Inner loop doesn't check for CWT_STOP, do that first. */ if (__predict_false(cwt->cwt_state == CWT_STOP)) break; cwt->cwt_state = CWT_SLEEPING; cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); } MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL); mtx_assert(&cwt->cwt_lock, MA_OWNED); cwt->cwt_state = CWT_STOPPED; cv_signal(&cwt->cwt_cv); mtx_unlock(&cwt->cwt_lock); kthread_exit(); } static int start_worker_threads(void) { int i, rc; struct cxgbei_worker_thread_softc *cwt; worker_thread_count = min(mp_ncpus, 32); cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, M_WAITOK | M_ZERO); MPASS(cxgbei_proc == NULL); for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF); cv_init(&cwt->cwt_cv, "cwt cv"); TAILQ_INIT(&cwt->rx_head); rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0, "cxgbei", "%d", i); if (rc != 0) { printf("cxgbei: failed to start thread #%d/%d (%d)\n", i + 1, worker_thread_count, rc); mtx_destroy(&cwt->cwt_lock); cv_destroy(&cwt->cwt_cv); bzero(cwt, sizeof(*cwt)); if (i == 0) { free(cwt_softc, M_CXGBE); worker_thread_count = 0; return (rc); } /* Not fatal, carry on with fewer threads. */ worker_thread_count = i; rc = 0; break; } /* Wait for thread to start before moving on to the next one. 
*/ mtx_lock(&cwt->cwt_lock); while (cwt->cwt_state == 0) cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); mtx_unlock(&cwt->cwt_lock); } MPASS(cwt_softc != NULL); MPASS(worker_thread_count > 0); return (0); } static void stop_worker_threads(void) { int i; struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0]; MPASS(worker_thread_count >= 0); for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { mtx_lock(&cwt->cwt_lock); MPASS(cwt->cwt_state == CWT_RUNNING || cwt->cwt_state == CWT_SLEEPING); cwt->cwt_state = CWT_STOP; cv_signal(&cwt->cwt_cv); do { cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); } while (cwt->cwt_state != CWT_STOPPED); mtx_unlock(&cwt->cwt_lock); } free(cwt_softc, M_CXGBE); } /* Select a worker thread for a connection. */ u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc) { struct adapter *sc = icc->sc; struct toepcb *toep = icc->toep; u_int i, n; n = worker_thread_count / sc->sge.nofldrxq; if (n > 0) i = toep->vi->pi->port_id * n + arc4random() % n; else i = arc4random() % worker_thread_count; CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i); return (i); } static int cxgbei_mod_load(void) { int rc; t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr); t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data); t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp); rc = start_worker_threads(); if (rc != 0) return (rc); rc = t4_register_uld(&cxgbei_uld_info); if (rc != 0) { stop_worker_threads(); return (rc); } t4_iterate(cxgbei_activate_all, NULL); return (rc); } static int cxgbei_mod_unload(void) { t4_iterate(cxgbei_deactivate_all, NULL); if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY) return (EBUSY); stop_worker_threads(); t4_register_cpl_handler(CPL_ISCSI_HDR, NULL); t4_register_cpl_handler(CPL_ISCSI_DATA, NULL); t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL); return (0); } #endif static int cxgbei_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = 
cxgbei_mod_load(); if (rc == 0) rc = icl_cxgbei_mod_load(); break; case MOD_UNLOAD: rc = icl_cxgbei_mod_unload(); if (rc == 0) rc = cxgbei_mod_unload(); break; default: rc = EINVAL; } #else printf("cxgbei: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t cxgbei_mod = { "cxgbei", cxgbei_modevent, NULL, }; MODULE_VERSION(cxgbei, 1); DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1); MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1); MODULE_DEPEND(cxgbei, icl, 1, 1, 1); Index: head/sys/dev/cxgbe/tom/t4_connect.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_connect.c (revision 335923) +++ head/sys/dev/cxgbe/tom/t4_connect.c (revision 335924) @@ -1,502 +1,503 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * Active open succeeded. */ static int do_act_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_establish *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); u_int atid = G_TID_TID(ntohl(cpl->tos_atid)); struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 
2 : 1); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_flowc_wr(toep, NULL); send_reset(sc, toep, be32toh(cpl->snd_isn)); goto done; } make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); if (toep->ulp_mode == ULP_MODE_TLS) tls_establish(toep); done: INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) { struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; + struct epoch_tracker et; free_atid(sc, atid); toep->tid = -1; CURVNET_SET(toep->vnet); if (status != EAGAIN) - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); } /* * Active open failed. */ static int do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct toepcb *toep = lookup_atid(sc, atid); int rc; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); /* Ignore negative advice */ if (negative_advice(status)) return (0); if (status && act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), toep->ctrlq); rc = act_open_rpl_status_to_errno(status); act_open_failure_cleanup(sc, atid, rc); return (0); } /* * Options2 for active open. 
*/ static uint32_t calc_opt2a(struct socket *so, struct toepcb *toep, const struct offload_settings *s) { struct tcpcb *tp = so_sototcpcb(so); struct port_info *pi = toep->vi->pi; struct adapter *sc = pi->adapter; uint32_t opt2 = 0; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) opt2 |= F_SACK_EN; if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) opt2 |= F_TSTAMPS_EN; if (tp->t_flags & TF_REQ_SCALE) opt2 |= F_WND_SCALE_EN; if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) opt2 |= F_CCTRL_ECN; /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); /* These defaults are subject to ULP specific fixups later. */ opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); opt2 |= V_PACE(0); if (s->cong_algo >= 0) opt2 |= V_CONG_CNTRL(s->cong_algo); else if (sc->tt.cong_algorithm >= 0) opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); else if (strcasecmp(cc->name, "tahoe") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); if (strcasecmp(cc->name, "newreno") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); if (strcasecmp(cc->name, "highspeed") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. 
*/ opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); } } if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); /* Note that ofld_rxq is already set according to s->rxq. */ opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (toep->ulp_mode == ULP_MODE_TCPDDP) opt2 |= F_RX_FC_DDP; #endif if (toep->ulp_mode == ULP_MODE_TLS) { opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } return (htobe32(opt2)); } void t4_init_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl, CPL_COOKIE_TOM); } void t4_uninit_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM); } #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ reason = __LINE__; \ rc = (x); \ goto failed; \ } while (0) static inline int act_open_cpl_size(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { sizeof (struct cpl_act_open_req), sizeof (struct cpl_act_open_req6) }, { sizeof (struct cpl_t5_act_open_req), sizeof (struct cpl_t5_act_open_req6) }, { sizeof (struct cpl_t6_act_open_req), sizeof (struct cpl_t6_act_open_req6) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } /* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. 
*/ int t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct tom_data *td = tod_td(tod); struct toepcb *toep = NULL; struct wrqe *wr = NULL; struct ifnet *rt_ifp = rt->rt_ifp; struct vi_info *vi; int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; struct offload_settings settings; uint16_t vid = 0xffff; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (rt_ifp->if_type == IFT_ETHER) vi = rt_ifp->if_softc; else if (rt_ifp->if_type == IFT_L2VLAN) { struct ifnet *ifp = VLAN_COOKIE(rt_ifp); vi = ifp->if_softc; VLAN_TAG(rt_ifp, &vid); } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, vid, inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); if (settings.txq >= 0 && settings.txq < vi->nofldtxq) txqid = settings.txq; else txqid = arc4random() % vi->nofldtxq; txqid += vi->first_ofld_txq; if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq) rxqid = settings.rxq; else rxqid = arc4random() % vi->nofldrxq; rxqid += vi->first_ofld_rxq; toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, rt->rt_flags & RTF_GATEWAY ? 
rt->rt_gateway : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; set_ulp_mode(toep, select_ulp_mode(so, sc, &settings)); SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep->ce = hold_lip(td, &inp->in6p_laddr, NULL); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo 
= *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tid, tcpstates[tp->t_state], toep, inp); offload_socket(so, toep); rc = t4_l2t_send(sc, wr, toep->l2te); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); reason = __LINE__; failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) release_lip(td, toep->ce); free_toepcb(toep); } return (rc); } #endif Index: head/sys/dev/cxgbe/tom/t4_cpl_io.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_cpl_io.c (revision 335923) +++ head/sys/dev/cxgbe/tom/t4_cpl_io.c (revision 335924) @@ -1,2345 +1,2349 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. 
* Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/

/*
 * NOTE(review): the header names on the bare #include lines below are missing
 * from the text under review; restore them from the repository copy.
 */
#include
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define TCPSTATES
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void	t4_aiotx_cancel(struct kaiocb *job);
static void	t4_aiotx_queue_toep(struct toepcb *toep);

/*
 * Offset within a page at which the data of an AIO tx mbuf begins.  The
 * mbuf's ext_arg1 is the aiotx_buffer and ext_arg2 is added to the buffer's
 * own starting offset.
 */
static size_t
aiotx_mbuf_pgoff(struct mbuf *m)
{
	struct aiotx_buffer *ab;

	MPASS(IS_AIOTX_MBUF(m));

	ab = m->m_ext.ext_arg1;
	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
}

/*
 * Pointer to the entry in the aiotx_buffer's page array that holds the first
 * byte of this mbuf's data (the companion of aiotx_mbuf_pgoff()).
 */
static vm_page_t *
aiotx_mbuf_pages(struct mbuf *m)
{
	struct aiotx_buffer *ab;
	int npages;

	MPASS(IS_AIOTX_MBUF(m));

	ab = m->m_ext.ext_arg1;
	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
	return (ab->ps.pages + npages);
}

/*
 * Build and transmit the FW_FLOWC_WR for a tid and mark the toep with
 * TPF_FLOWC_WR_SENT.
 *
 * ftxp is optional.  When it is provided the work request carries SNDNXT,
 * RCVNXT, SNDBUF and MSS taken from it (8 base parameters); when it is NULL
 * only fixed SNDBUF/MSS values of 512 are sent (6 base parameters).  ULP_MODE,
 * TXDATAPLEN_MAX and SCHEDCLASS are appended when the toep's TLS/scheduling
 * state calls for them.  One tx descriptor and the matching tx credits are
 * consumed.  Panics if the work request cannot be allocated.
 */
void
send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	/*
	 * Count the parameters up front; this must agree with the
	 * FLOWC_PARAM() calls below (checked by the KASSERT after them).
	 */
	if (ftxp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->ulp_mode == ULP_MODE_TLS)
		nparams++;
	if (toep->tls.fcplenmax != 0)
		nparams++;
	if (toep->tc_idx != -1) {
		MPASS(toep->tc_idx >= 0 &&
		    toep->tc_idx < sc->chip_params->nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

/* Store one mnemonic/value pair at paramidx and advance paramidx. */
#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		/* Never advertise more than the adapter-wide sndbuf tunable. */
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
		FLOWC_PARAM(SNDBUF, sndbuf);
		FLOWC_PARAM(MSS, ftxp->mss);

		CTR6(KTR_CXGBE,
		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
		    ftxp->rcv_nxt);
	} else {
		FLOWC_PARAM(SNDBUF, 512);
		FLOWC_PARAM(MSS, 512);

		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
	}
	if (toep->ulp_mode == ULP_MODE_TLS)
		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
	if (toep->tls.fcplenmax != 0)
		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
	if (toep->tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	/* Account for the WR in the tx descriptor ring; it carries no payload. */
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	/* The multiplication is carried out in 64 bits before narrowing. */
	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
	}

	/* Tell the firmware only when the scheduling class actually changes. */
	if (toep->tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		/*
		 * Without credits, a free tx descriptor, or a work request the
		 * change cannot be sent; give back the class reserved above.
		 */
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) ==
		    NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl_kbps(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		/* 0xff is what is sent when the tid is being unbound. */
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	/* Drop the reference on the class the tid was using until now. */
	if (toep->tc_idx >= 0)
		t4_release_cl_rl_kbps(sc, port_id, toep->tc_idx);
	toep->tc_idx = tc_idx;

	return (0);
}
#endif

/*
 * Send a CPL_ABORT_REQ (CPL_ABORT_SEND_RST) for the tid, unless an abort is
 * already in progress, and mark the toep with TPF_ABORT_SHUTDOWN.
 *
 * snd_nxt is used only when the inp has been dropped; otherwise the value is
 * read from the tcpcb.  The inp must be write-locked.  Panics if the work
 * request cannot be allocated.
 */
void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	/* rsvd1 is set when no tx data has been sent on this tid yet. */
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
*/
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);
	const unsigned int mtu_idx = G_TCPOPT_MSS(opt);
	const u_int wscale_flags = TF_RCVD_SCALE | TF_REQ_SCALE;
	int hdr_len;

	INP_LOCK_ASSERT(inp);

	/* MSS = MTU from the adapter's table minus the IP and TCP headers. */
	hdr_len = sizeof(struct tcphdr);
	hdr_len += (inp->inp_inc.inc_flags & INC_ISIPV6) ?
	    sizeof(struct ip6_hdr) : sizeof(struct ip);
	tp->t_maxseg = sc->params.mtus[mtu_idx] - hdr_len;

	/* Timestamps were negotiated: they take up room in every segment. */
	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->ts_recent = 0;
		tp->ts_recent_age = tcp_ts_getticks();
	}

	CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__,
	    toep->tid, mtu_idx, sc->params.mtus[mtu_idx], tp->t_maxseg);

	/* SACK stays permitted only if the reported options include it. */
	if (!G_TCPOPT_SACK(opt))
		tp->t_flags &= ~TF_SACK_PERMIT;
	else
		tp->t_flags |= TF_SACK_PERMIT;

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Window scaling is in effect only when both requested and received. */
	if ((tp->t_flags & wscale_flags) == wscale_flags) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
 */
void
make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
    uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	long bufsize;
	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(opt);
	struct flowc_tx_params ftxp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	/* Receive side: rx_credits is in 1KB units at this point. */
	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = toep->rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->rx_credits = bufsize - tp->rcv_wnd;

	/* Send side: the SYN has consumed one sequence number. */
	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);

	/* bufsize is reused for the send space reported in the flowc WR. */
	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
		bufsize = V_tcp_autosndbuf_max;
	else
		bufsize = sbspace(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);

	ftxp.snd_nxt = tp->snd_nxt;
	ftxp.rcv_nxt = tp->rcv_nxt;
	ftxp.snd_space = bufsize;
	ftxp.mss = tp->t_maxseg;
	send_flowc_wr(toep, &ftxp);

	soisconnected(so);
}

/*
 * Return rx credits to the hardware with a CPL_RX_DATA_ACK on the control
 * queue.  Returns the number of credits handed over: 'credits' on success,
 * 0 if no work request could be allocated.
 */
int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

/*
 * Send a CPL_RX_DATA_ACK that carries only F_RX_MODULATE_RX (no credits).
 * Silently does nothing if no work request could be allocated.
 */
void
send_rx_modulate(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return;
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	t4_wrq_tx(sc, wr);
}

/*
 * Account for data consumed from the receive buffer since the last call and
 * return the freed space to the hardware as rx credits once enough has
 * accumulated.  The inp write lock and the receive sockbuf lock must be held.
 */
void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int credits;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(toep->sb_cc >= sbused(sb),
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sbused(sb), toep->sb_cc));

	/* Bytes drained from the sockbuf since sb_cc was last recorded. */
	credits = toep->sb_cc - sbused(sb);
	toep->sb_cc = sbused(sb);
	if (toep->ulp_mode == ULP_MODE_TLS) {
		/* Apply the drained bytes against tls.rcv_over first. */
		if (toep->tls.rcv_over >= credits) {
			toep->tls.rcv_over -= credits;
			credits = 0;
		} else {
			credits -= toep->tls.rcv_over;
			toep->tls.rcv_over = 0;
		}
	}
	toep->rx_credits += credits;

	/* Batch credit returns; send only when one of the thresholds is met. */
	if (toep->rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	} else if (toep->flags & TPF_FORCE_CREDITS)
		send_rx_modulate(sc, toep);
}

/* Wrapper that takes the receive sockbuf lock around t4_rcvd_locked(). */
void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 *
 * Does nothing if a FIN has already been sent (TPF_FIN_SENT).  Always
 * returns 0; panics if the work request cannot be allocated.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

/* Tx credits are counted in 16-byte units. */
#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 2;	/* Use only up to 2 desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	/* Not even enough credits for the WR header plus one byte. */
	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	/* Each 24 bytes hold a pair of entries; a 16-byte tail holds one more. */
	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

/*
 * Fill in the fw_ofld_tx_data_wr header at dst.  immdlen and plen are written
 * to the immediate-length and payload-length fields, credits to the LEN16
 * field.  When txalign is positive either LSO is disabled (short payloads or
 * a 10G port) or payload alignment is requested.
 */
static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (txalign > 0) {
		struct tcpcb *tp = intotcpcb(toep->inp);

		if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi))
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(tp->t_flags & TF_NODELAY ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	/* i == -1 selects len0/addr0; i >= 0 indexes into the sge pairs. */
	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (IS_AIOTX_MBUF(m))
			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
			    aiotx_mbuf_pgoff(m), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		/* The sglist is reused for the next mbuf. */
		sglist_reset(&sg);
	}
	/* Zero the length of the unused second half of the last pair. */
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have.
This is 41 * (1 + 40) for a full 512B work request. * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) */ #define OFLD_SGL_LEN (41) /* * Send data and/or a FIN to the peer. * * The socket's so_snd buffer consists of a stream of data starting with sb_mb * and linked together with m_next. sb_sndptr, if set, is the last mbuf that * was transmitted. * * drop indicates the number of bytes that should be dropped from the head of * the send buffer. It is an optimization that lets do_fw4_ack avoid creating * contention on the send buffer lock (before this change it used to do * sowwakeup and then t4_push_frames right after that when recovering from tx * stalls). When drop is set this function MUST drop the bytes and wake up any * writers. */ void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m, *sb_sndptr; struct fw_ofld_tx_data_wr *txwr; struct wrqe *wr; u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tx_credits, shove, compl, sowwakeup; struct ofld_tx_sdesc *txsd; bool aiotx_mbuf_seen; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(toep->ulp_mode == ULP_MODE_NONE || toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS || toep->ulp_mode == ULP_MODE_RDMA, ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by 
itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; do { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); max_imm = max_imm_payload(tx_credits); max_nsegs = max_dsgl_nsegs(tx_credits); SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } sb_sndptr = sb->sb_sndptr; sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb; plen = 0; nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ aiotx_mbuf_seen = false; for (m = sndptr; m != NULL; m = m->m_next) { int n; if (IS_AIOTX_MBUF(m)) n = sglist_count_vmpages(aiotx_mbuf_pages(m), aiotx_mbuf_pgoff(m), m->m_len); else n = sglist_count(mtod(m, void *), m->m_len); nsegs += n; plen += m->m_len; /* This mbuf sent us _over_ the nsegs limit, back out */ if (plen > max_imm && nsegs > max_nsegs) { nsegs -= n; plen -= m->m_len; if (plen == 0) { /* Too few credits */ toep->flags |= TPF_TX_SUSPENDED; if (sowwakeup) { if (!TAILQ_EMPTY( &toep->aiotx_jobq)) t4_aiotx_queue_toep( toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); return; } break; } if (IS_AIOTX_MBUF(m)) aiotx_mbuf_seen = true; if (max_nsegs_1mbuf < n) max_nsegs_1mbuf = n; sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ /* This mbuf put us right at the max_nsegs limit */ if (plen > max_imm && nsegs == max_nsegs) { m = m->m_next; break; } } if (sbused(sb) > sb->sb_hiwat * 5 / 8 && toep->plen_nocompl + plen >= sb->sb_hiwat / 4) compl = 1; else compl = 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if 
(sowwakeup) { if (!TAILQ_EMPTY(&toep->aiotx_jobq)) t4_aiotx_queue_toep(toep); sowwakeup_locked(so); } else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); /* nothing to send */ if (plen == 0) { KASSERT(m == NULL, ("%s: nothing to send, but m != NULL", __func__)); break; } if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); if (plen <= max_imm && !aiotx_mbuf_seen) { /* Immediate data tx */ wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), toep->ofld_txq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, sc->tt.tx_align); m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); nsegs = 0; } else { int wr_len; /* DSGL tx */ wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); if (wr == NULL) { /* XXX: how will we recover from this? 
*/ toep->flags |= TPF_TX_SUSPENDED; return; } txwr = wrtod(wr); credits = howmany(wr_len, 16); write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, sc->tt.tx_align); write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { uint64_t *pad = (uint64_t *) ((uintptr_t)txwr + wr_len); *pad = 0; } } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; toep->tx_nocompl += credits; toep->plen_nocompl += plen; if (toep->tx_credits <= toep->tx_total * 3 / 8 && toep->tx_nocompl >= toep->tx_total / 4) compl = 1; if (compl || toep->ulp_mode == ULP_MODE_RDMA) { txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); toep->tx_nocompl = 0; toep->plen_nocompl = 0; } tp->snd_nxt += plen; tp->snd_max += plen; SOCKBUF_LOCK(sb); KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); sb->sb_sndptr = sb_sndptr; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TX_CREDITS) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); txsd->plen = plen; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; t4_l2t_send(sc, wr, toep->l2te); } while (m != NULL); /* Send a FIN if requested, but only if there's no more data to send */ if (m == NULL && toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); } static inline void rqdrop_locked(struct mbufq *q, int plen) { struct mbuf *m; while (plen > 0) { m = mbufq_dequeue(q); /* Too many credits. */ MPASS(m != NULL); M_ASSERTPKTHDR(m); /* Partial credits. 
 */
		MPASS(plen >= m->m_pkthdr.len);
		plen -= m->m_pkthdr.len;

		m_freem(m);
	}
}

/*
 * Transmit queued PDUs (ULP_MODE_ISCSI only), one work request per PDU.
 *
 * drop is the number of payload bytes to release from the head of the reclaim
 * queue before sending.  PDUs are taken from ulp_pduq in order and, once
 * handed to the hardware, parked on ulp_pdu_reclaimq.  Transmission is
 * suspended (TPF_TX_SUSPENDED) when a PDU does not fit in the available
 * credits or a work request cannot be allocated.  The inp must be
 * write-locked.
 */
void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;
	/* Extra sequence space used by the digests, indexed by ULP submode. */
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop)
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		M_ASSERTPKTHDR(sndptr);

		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/*
			 * This mbuf would send us _over_ the nsegs limit.
			 * Suspend tx because the PDU can't be sent out.
			 */
			if (plen > max_imm && nsegs > max_nsegs) {
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/*
		 * We have a PDU to send.  All of it goes out in one WR so 'm'
		 * is NULL.  A PDU's length is always a multiple of 4.
		 */
		MPASS(m == NULL);
		MPASS((plen & 3) == 0);
		MPASS(sndptr->m_pkthdr.len == plen);

		shove = !(tp->t_flags & TF_MORETOCOME);
		ulp_submode = mbuf_ulp_submode(sndptr);
		MPASS(ulp_submode < nitems(ulp_extra_len));

		/*
		 * plen doesn't include header and data digests, which are
		 * generated and inserted in the right places by the TOE, but
		 * they do occupy TCP sequence space and need to be accounted
		 * for.
		 */
		adjusted_plen = plen + ulp_extra_len[ulp_submode];
		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
					toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */
			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			/* Zero the 8 bytes of padding up to the 16B boundary. */
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		/* The PDU moves to the reclaim queue until rqdrop_locked(). */
		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		/* Sequence space advances by the digest-inclusive length. */
		tp->snd_nxt += adjusted_plen;
		tp->snd_max += adjusted_plen;

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

/*
 * Push pending tx for the connection through the transmit routine that matches
 * its ULP mode (iSCSI PDUs, TLS records, or plain frames).  Always returns 0.
 */
int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	if (toep->ulp_mode == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, 0);
	else if (tls_tx_key(toep))
		t4_push_tls_records(sc, toep, 0);
	else
		t4_push_frames(sc, toep, 0);

	return (0);
}

/*
 * Request a FIN by setting TPF_SEND_FIN.  If the connection has reached
 * TCPS_ESTABLISHED or later, the transmit routine is run right away so the
 * FIN can go out once no data remains.  Always returns 0.
 */
int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED) {
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, 0);
		else if (tls_tx_key(toep))
			t4_push_tls_records(sc, toep, 0);
		else
			t4_push_frames(sc, toep, 0);
	}

	return (0);
}

/*
 * Abort the connection via send_reset().  The snd_nxt argument passed is 0;
 * send_reset() reads it from the tcpcb because the inp is not dropped here.
 * Always returns 0.
 */
int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
*/
/*
 * Handler registered for CPL_PEER_CLOSE.
 *
 * The CPL follows the RSS header; no mbuf payload is expected (m == NULL).
 * For a tid that still belongs to a synq entry nothing is done beyond
 * (INVARIANTS-only) sanity checks.  Otherwise rcv_nxt is advanced for the
 * FIN, the socket is marked unable to receive more, and the TCP state is
 * advanced.  Lock order: vnet context, tcbinfo read lock, inpcb write lock.
 * Always returns 0.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
+	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
#ifdef INVARIANTS
		/* The "toep" is really a synq entry in this case. */
		struct synq_entry *synqe = (void *)toep;

		INP_WLOCK(synqe->lctx->inp);
		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		} else {
			/*
			 * do_pass_accept_req is still running and will
			 * eventually take care of this tid.
			 */
		}
		INP_WUNLOCK(synqe->lctx->inp);
#endif
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
-	INP_INFO_RLOCK(&V_tcbinfo);
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);

	/* An abort is already in progress; leave everything alone. */
	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	tp->rcv_nxt++;	/* FIN */

	so = inp->inp_socket;
	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}
	socantrcvmore(so);

	if (toep->ulp_mode != ULP_MODE_RDMA) {
		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;

	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;

	case TCPS_FIN_WAIT_2:
		/*
		 * The assertion below shows the inp is expected to be
		 * unlocked once tcp_twstart() returns; it is re-locked
		 * before final_cpl_received().
		 */
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
-		INP_INFO_RUNLOCK(&V_tcbinfo);
+		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
*/
/*
 * Handler registered for CPL_CLOSE_CON_RPL.
 *
 * No mbuf payload is expected.  Updates snd_una from the CPL (excluding
 * the FIN) and advances the TCP state.  The CLOSING and LAST_ACK states
 * share the "release" path, which drops the tcbinfo lock, re-locks the
 * inp and calls final_cpl_received().  Always returns 0.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
+	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
-	INP_INFO_RLOCK(&V_tcbinfo);
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
-		INP_INFO_RUNLOCK(&V_tcbinfo);
+		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		/* Unlock only when tcp_close() returned a non-NULL tcpcb. */
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tp->t_state = TCPS_FIN_WAIT_2;
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Build a CPL_ABORT_RPL for tid with command rst_status and transmit it on
 * ofld_txq.  Panics if the work request cannot be allocated.
 */
void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

/*
 * Map a CPL abort reason to an errno value:
 *   bad SYN / connection reset -> EPIPE in CLOSE_WAIT, else ECONNRESET
 *   any of the *_TIMEDOUT reasons -> ETIMEDOUT
 *   anything else -> EIO
 */
static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 *
 * Handler registered for CPL_ABORT_REQ_RSS.  Synq entries are delegated to
 * do_abort_req_synqe() and negative advice is ignored.  On every other path
 * a CPL_ABORT_RPL is sent back before returning.  Always returns 0 (or the
 * synqe handler's result).
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	/* Saved up front: used for the reply after final_cpl_received(). */
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
+	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
-	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 *
 * Synq entries are delegated to do_abort_rpl_synqe().  Otherwise the inp is
 * write-locked and final_cpl_received() is called; this function does not
 * unlock the inp itself.
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

/*
 * Handler registered for CPL_RX_DATA.
 *
 * The CPL header sits at the front of mbuf m and is stripped before the
 * payload is appended to the socket's receive buffer.  The mbuf is freed on
 * every early-exit path.  Data arriving after the socket can no longer
 * receive causes the connection to be dropped with ECONNRESET.  Always
 * returns 0.
 */
static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
+	struct epoch_tracker et;
	int len;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
#ifdef INVARIANTS
		struct synq_entry *synqe = (void *)toep;

		INP_WLOCK(synqe->lctx->inp);
		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		} else {
			/*
			 * do_pass_accept_req is still running and will
			 * eventually take care of this tid.
			 */
		}
		INP_WUNLOCK(synqe->lctx->inp);
#endif
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * A sequence number ahead of rcv_nxt is recorded as ddp_placed and
	 * later passed to insert_ddp_data() if the tid fell out of DDP mode.
	 */
	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
				("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (toep->ulp_mode == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		/*
		 * Drop the inp lock so it can be re-taken after the tcbinfo
		 * lock, which is acquired first everywhere else in this file.
		 */
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
-		INP_INFO_RLOCK(&V_tcbinfo);
+		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
+		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		/* On failure stop autosizing; on success bank the growth. */
		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->rx_credits += newsize - hiwat;
	}

	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		/* True when the CPL's ddp_off disagrees with our DDP_ON flag. */
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else {
				KASSERT(cpl->ddp_off == 1,
				    ("%s: DDP switched on by itself.",
				    __func__));

				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	KASSERT(toep->sb_cc >= sbused(sb),
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sbused(sb), toep->sb_cc));
	/* Bytes that left the buffer since the last visit become credits. */
	toep->rx_credits += toep->sb_cc - sbused(sb);
	sbappendstream_locked(sb, m, 0);
	toep->sb_cc = sbused(sb);
	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}

	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Handler registered (shared, CPL_COOKIE_TOM) for CPL_FW4_ACK.
 *
 * Returns tx credits to the toepcb by walking the tx descriptor ring from
 * txsd_cidx, totalling the payload length (plen) those descriptors covered.
 * Transmission is then either resumed, if it was suspended and enough
 * credits are back, or plen bytes are released from the send side.
 * Always returns 0.
 */
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	/* Retire whole descriptors until the returned credits are used up. */
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		if (txsd->iv_buffer) {
			free(txsd->iv_buffer, M_CXGBE);
			txsd->iv_buffer = NULL;
		}
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	/* Everything outstanding has been acknowledged. */
	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else if (tls_tx_key(toep))
			t4_push_tls_records(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (toep->ulp_mode == ULP_MODE_ISCSI) {

			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP mode
				 * changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (tls_tx_key(toep)) {
				struct tls_ofld_info *tls_ofld = &toep->tls;

				MPASS(tls_ofld->sb_off >= plen);
				tls_ofld->sb_off -= plen;
			}
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

/*
 * Build and transmit a CPL_SET_TCB_FIELD for toep->tid on wrq.
 *
 * word/mask/val are placed in the request in big-endian form.  When reply
 * is 0 the request is flagged F_NO_REPLY; when reply is non-zero the cookie
 * must not be CPL_COOKIE_RESERVED.  If wrq is an offload queue (EQ_OFLD) the
 * request also consumes a tx descriptor and tx credits from the toepcb.
 * Panics if the work request cannot be allocated.
 */
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
		/* Account for the request in 16-byte credit units. */
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

/*
 * Register the CPL handlers defined in this file.  CPL_ABORT_RPL_RSS and
 * CPL_FW4_ACK are registered as shared handlers under CPL_COOKIE_TOM.
 */
void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

/*
 * Undo t4_init_cpl_io_handlers() by registering NULL for the same opcodes.
 */
void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * sent by the AIO job so far and the 'backend4' field to hold an
 * error that should be reported when the job is completed.
*/
#define	aio_sent	backend3
#define	aio_error	backend4

/* Resolve an AIO job to the tid of the toepcb behind its socket. */
#define	jobtotid(job)							\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

/*
 * Drop one reference on an AIO transmit buffer.
 *
 * When the last reference goes away the held pages are released, the buffer
 * is freed and the job is finished: cancelled if it was cancelled before
 * any data was sent, completed with the recorded error, or completed with
 * the number of bytes sent.  A cancellation after a partial send is
 * reported as a successful short write.
 */
static void
free_aiotx_buffer(struct aiotx_buffer *ab)
{
	struct kaiocb *job;
	long status;
	int error;

	if (refcount_release(&ab->refcount) == 0)
		return;

	/* Read everything needed from ab before it is freed. */
	job = ab->job;
	error = job->aio_error;
	status = job->aio_sent;
	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
	free(ab, M_CXGBE);
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error == ECANCELED && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, status, 0);
}

/*
 * External-storage free routine installed by m_extadd() in
 * t4_aiotx_process_job(): drops the reference the mbuf held on the AIO
 * transmit buffer stored in ext_arg1.
 */
static void
t4_aiotx_mbuf_free(struct mbuf *m)
{
	struct aiotx_buffer *ab = m->m_ext.ext_arg1;

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(ab->job));
#endif
	free_aiotx_buffer(ab);
}

/*
 * Hold the buffer backing an AIO request and return an AIO transmit
 * buffer.
 *
 * On success the new buffer (refcount 1) is stored in job->backend1 and 0
 * is returned.  Returns EFAULT if the user pages cannot be held.
 */
static int
hold_aio(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct vmspace *vm;
	vm_map_t map;
	vm_offset_t start, end, pgoff;
	int n;

	MPASS(job->backend1 == NULL);

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	/* Expand [aio_buf, aio_buf + aio_nbytes) to whole pages. */
	start = (uintptr_t)job->uaiocb.aio_buf;
	pgoff = start & PAGE_MASK;
	end = round_page(start + job->uaiocb.aio_nbytes);
	start = trunc_page(start);
	n = atop(end - start);

	/* The page pointer array lives directly after the header. */
	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
	    M_ZERO);
	refcount_init(&ab->refcount, 1);
	ab->ps.pages = (vm_page_t *)(ab + 1);
	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
	    VM_PROT_WRITE, ab->ps.pages, n);
	if (ab->ps.npages < 0) {
		free(ab, M_CXGBE);
		return (EFAULT);
	}

	KASSERT(ab->ps.npages == n,
	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

	ab->ps.offset = pgoff;
	ab->ps.len = job->uaiocb.aio_nbytes;
	ab->job = job;
	job->backend1 = ab;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
	return (0);
}

/*
 * Send as much of one AIO write job as the socket currently allows.
 *
 * Entered with so_snd locked; the lock is dropped immediately and is held
 * again on every return path.  Each pass attaches the job's held pages to
 * an mbuf as external storage, appends it to so_snd and calls the
 * connection's tcp_output routine.  A job that cannot finish now is put
 * back at the head of toep->aiotx_jobq.
 */
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct aiotx_buffer *ab;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	ab = job->backend1;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* First visit for this job: wire down the user buffer. */
	if (ab == NULL) {
		error = hold_aio(job);
		if (error != 0)
			goto out;
		ab = job->backend1;
	}

	/* Inline sosend_generic(). */

	job->msgsnd = 1;

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	m = m_get(M_WAITOK, MT_DATA);

	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	m->m_len = sbspace(sb);
	if (m->m_len > ab->ps.len - job->aio_sent) {
		m->m_len = ab->ps.len - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (m->m_len > sc->tt.sndbuf) {
		m->m_len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	/* Other queued jobs also count as more data on the way. */
	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(m->m_len != 0);

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	/* The mbuf takes its own reference; ext_arg2 is the buffer offset. */
	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
	job->aio_sent += m->m_len;

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a non-blocking socket and the request has not
	 * been fully completed, requeue it until the socket is ready
	 * again.
	 *
	 * NOTE(review): the condition below requeues when SS_NBIO is
	 * *clear*, which does not match the wording above - confirm
	 * which of the two is intended.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		/* Returns with so_snd locked, as on the "out" path. */
		return;
	}

	/*
	 * If the request will not be requeued, drop a reference on
	 * the aiotx buffer.  Any mbufs in flight should still
	 * contain a reference, but this drops the reference that the
	 * job owns while it is waiting to queue mbufs to the socket.
	 */
	free_aiotx_buffer(ab);

out:
	if (error) {
		if (ab != NULL) {
			job->aio_error = error;
			free_aiotx_buffer(ab);
		} else {
			MPASS(job->aio_sent == 0);
			aio_complete(job, -1, error);
		}
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

/*
 * Task handler set up by aiotx_init_toep().
 *
 * Processes queued jobs while the socket is writeable, skipping jobs whose
 * cancel function can no longer be cleared.  Finally clears
 * aiotx_task_active and drops the toepcb reference taken in
 * t4_aiotx_queue_toep().
 */
static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct kaiocb *job;

	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_task_active = false;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
}

/*
 * Schedule the AIO transmit task for toep unless it is already active.
 * Requires the so_snd lock.  Takes a toepcb reference for the task.
 */
static void
t4_aiotx_queue_toep(struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
#endif
	if (toep->aiotx_task_active)
		return;
	toep->aiotx_task_active = true;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

/*
 * Cancel routine installed on queued AIO write jobs.
 *
 * Takes the job off the toepcb's queue unless its cancel function had
 * already been cleared, then either drops the job's reference on its
 * transmit buffer or, if it never got one, cancels the job directly.
 */
static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	ab = job->backend1;
	if (ab != NULL)
		free_aiotx_buffer(ab);
	else
		aio_cancel(job);
}

/*
 * Queue an AIO job for zero-copy transmit on an offloaded socket.
 *
 * Returns EOPNOTSUPP for anything other than LIO_WRITE, when tx_zcopy is
 * disabled on the adapter, or when tls_tx_key() is true for the connection.
 * Otherwise appends the job to the toepcb's queue, kicks the transmit task
 * if the socket is writeable, and returns 0.
 */
int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

/*
 * Initialize the AIO transmit job queue and task of a toepcb.
 */
void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif
Index: head/sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_listen.c (revision 335923)
+++ head/sys/dev/cxgbe/tom/t4_listen.c (revision 335924)
@@ -1,1724 +1,1726 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *, struct offload_settings *); static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); static void send_reset_synqe(struct toedev *, struct synq_entry *); static int 
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); if (n > t->nstids - t->stids_in_use) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." 
" At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { struct tom_data *td = sc->tom_softc; lctx->ce = hold_lip(td, &inp->in6p_laddr, NULL); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); TAILQ_INIT(&lctx->synq); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this 
directly, use release_lctx instead */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
	/*
	 * Final teardown of a listen context whose refcount has dropped to
	 * zero and whose synq is empty: releases the lctx->ce entry (if any),
	 * returns the stid, frees the context, and drops the reference on the
	 * listening inp.  The inp must be write locked.
	 *
	 * Returns whatever in_pcbrele_wlocked() returns for the inp.
	 */
	struct inpcb *inp = lctx->inp;	/* saved; lctx is freed below */
	struct tom_data *td = sc->tom_softc;

	INP_WLOCK_ASSERT(inp);

	KASSERT(lctx->refcount == 0,
	    ("%s: refcount %d", __func__, lctx->refcount));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	if (lctx->ce)
		release_lip(td, lctx->ce);
	free_stid(sc, lctx);
	free(lctx, M_CXGBE);

	return (in_pcbrele_wlocked(inp));
}

/*
 * Take an additional reference on the listen context.
 */
static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcount);
}

/*
 * Hash for the listen table.  The bytes of the pointer value itself (not the
 * object it points to) are fed to fnv_32_buf, and the result is reduced with
 * mask.
 */
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.  The bucket is chosen by
 * hashing lctx->inp; the insert and the lctx_count update are done under
 * td->lctx_hash_lock.
 */
static void
listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
{
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 * The entry stays in the hash and no reference is taken on it here.
 */
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
{
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/* * Removes the listen_ctx structure for inp from the hash and returns it.
*/
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
{
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	/* Unlink the first entry whose inp matches, under the hash lock. */
	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx. Must be called with the listening socket's inp
 * locked. The inp may be freed by this function and it returns NULL to
 * indicate this.  Otherwise the (still locked) inp is returned.
 */
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;	/* saved; lctx may be freed below */
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcount))
		inp_freed = free_lctx(sc, lctx);

	return (inp_freed ? NULL : inp);
}

/*
 * Abort an embryonic (synq) connection.  Builds one work request holding a
 * FLOWC WR followed by a CPL_ABORT_REQ for synqe->tid, using the offload
 * queues recorded in the SYN mbuf.  Does nothing if an abort is already in
 * progress (TPF_ABORT_SHUTDOWN set).  The listening socket's inp must be
 * write locked.
 */
static void
send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m = synqe->syn;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	struct cpl_abort_req *req;
	int txqid, rxqid, flowclen;
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	const int nparams = 6;	/* number of mnemval[] entries filled in below */
	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;

	INP_WLOCK_ASSERT(synqe->lctx->inp);

	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
	    __func__, synqe, synqe->flags, synqe->tid,
	    synqe->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");
	if (synqe->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */
	synqe->flags |= TPF_ABORT_SHUTDOWN;

	get_qids_from_mbuf(m, &txqid, &rxqid);
	ofld_txq = &sc->sge.ofld_txq[txqid];
	ofld_rxq = &sc->sge.ofld_rxq[rxqid];

	/* The wrqe will have two WRs - a flowc followed by an abort_req */
	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	/* The abort request starts at the next EQ_ESIZE boundary. */
	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));

	/* First the flowc ... */
	memset(flowc, 0, wr->wr_len);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(synqe->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
	flowc->mnemval[4].val = htobe32(512);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
	flowc->mnemval[5].val = htobe32(512);
	synqe->flags |= TPF_FLOWC_WR_SENT;

	/* ...
then ABORT request */
	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
	req->rsvd0 = 0;	/* don't have a snd_nxt */
	req->rsvd1 = 1;	/* no data sent yet */
	req->cmd = CPL_ABORT_SEND_RST;

	/* Hand the combined flowc + abort work request to the L2T layer. */
	t4_l2t_send(sc, wr, e);
}

/*
 * Ask the hardware to open an IPv4 listener for lctx: sends a
 * CPL_PASS_OPEN_REQ for lctx->stid on lctx->ctrlq, with the local port and
 * address taken from the listening inp and the peer fields left as 0.
 *
 * Returns 0 once the request has been queued, or ENOMEM if the work request
 * cannot be allocated.
 */
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	/* Port and address are copied from the inp without conversion. */
	req->local_port = inp->inp_lport;
	req->peer_port = 0;
	req->local_ip = inp->inp_laddr.s_addr;
	req->peer_ip = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}

/*
 * IPv6 counterpart of create_server: sends a CPL_PASS_OPEN_REQ6 for
 * lctx->stid with the 128-bit local address split into two 64-bit halves.
 *
 * Returns 0 once the request has been queued, or ENOMEM if the work request
 * cannot be allocated.
 */
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req6 *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
	req->local_port = inp->inp_lport;
	req->peer_port = 0;
	/*
	 * NOTE(review): the address bytes are read through uint64_t pointer
	 * casts; this presumes the in6_addr storage is suitably aligned for
	 * 64-bit loads on every supported platform -- confirm.
	 */
	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
	req->peer_ip_hi = 0;
	req->peer_ip_lo = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_close_listsvr_req *req;

	/* Unlike create_server, an allocation failure here is fatal. */
	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req =
wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tp->t_inpcb; struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff, inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). 
*/ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && vi->ifp->if_capenable & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct synq_entry *synqe; INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { KASSERT(TAILQ_EMPTY(&lctx->synq), ("%s: synq not empty.", __func__)); return (EINPROGRESS); } /* * The host stack will abort all the connections on the listening * socket's so_comp. It doesn't know about the connections on the synq * so we need to take care of those. 
*/ TAILQ_FOREACH(synqe, &lctx->synq, link) { if (synqe->flags & TPF_SYNQE_HAS_L2TE) send_reset_synqe(tod, synqe); } destroy_server(sc, lctx); return (0); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline void release_synqe(struct synq_entry *synqe) { if (refcount_release(&synqe->refcnt)) { int needfree = synqe->flags & TPF_SYNQE_NEEDFREE; m_freem(synqe->syn); if (needfree) free(synqe, M_CXGBE); } } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; release_synqe(synqe); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct wrqe *wr; struct l2t_entry *e; struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); if (wr == NULL) { m_freem(m); return (EALREADY); } if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->ts = to.to_tsval; if (chip_id(sc) >= CHELSIO_T5) { struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr); rpl5->iss = th->th_seq; } e = &sc->l2t->l2tab[synqe->l2e_idx]; t4_l2t_send(sc, wr, e); m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif 
KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; if (status != CPL_ERR_NONE) log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. 
*/ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc; struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; int ntids; INP_WLOCK_ASSERT(inp); ntids = inp->inp_vflag & INP_IPV6 ? 
2 : 1; TAILQ_REMOVE(&lctx->synq, synqe, link); inp = release_lctx(sc, lctx); if (inp) INP_WUNLOCK(inp); remove_tid(sc, synqe->tid, ntids); release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]); t4_l2t_release(e); release_synqe(synqe); /* removed from synq list */ } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; int txqid; struct sge_wrq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); get_qids_from_mbuf(synqe->syn, &txqid, NULL); ofld_txq = &sc->sge.ofld_txq[txqid]; /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
*/ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; #ifdef INVARIANTS struct inpcb *inp = sotoinpcb(so); #endif struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); struct toepcb *toep = *(struct toepcb **)(cpl + 1); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); offload_socket(so, toep); make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; } static inline void save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi, struct 
offload_settings *s)
{
	/*
	 * Pick the offload tx and rx queues for a connection and stash both in
	 * the mbuf's flowid: tx queue index in the upper 16 bits, rx queue
	 * index in the lower 16 bits.  A queue requested by the settings is
	 * used when it is within the VI's range; otherwise one is picked at
	 * random.  Both indices are made absolute by adding the VI's first
	 * offload queue index.
	 */
	uint32_t txqid, rxqid;

	if (s->txq >= 0 && s->txq < vi->nofldtxq)
		txqid = s->txq;
	else
		txqid = arc4random() % vi->nofldtxq;
	txqid += vi->first_ofld_txq;

	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
		rxqid = s->rxq;
	else
		rxqid = arc4random() % vi->nofldrxq;
	rxqid += vi->first_ofld_rxq;

	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
}

/*
 * Recover the queue indices stored by save_qids_in_mbuf.  Either output
 * pointer may be NULL if that value is not wanted.
 */
static inline void
get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
{

	if (txqid)
		*txqid = m->m_pkthdr.flowid >> 16;
	if (rxqid)
		*rxqid = m->m_pkthdr.flowid & 0xffff;
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.
 *
 * If the trailing space is too small the synq_entry is allocated with
 * malloc(M_NOWAIT) instead and flagged TPF_SYNQE_NEEDFREE.  Only the flags
 * field is initialized here.  Returns NULL if that allocation fails.
 */
static struct synq_entry *
mbuf_to_synqe(struct mbuf *m)
{
	int len = roundup2(sizeof (struct synq_entry), 8);
	int tspace = M_TRAILINGSPACE(m);
	struct synq_entry *synqe = NULL;

	if (tspace < len) {
		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
		if (synqe == NULL)
			return (NULL);
		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
	} else {
		/* Place the entry in the last len bytes of the trailing space. */
		synqe = (void *)(m->m_data + m->m_len + tspace - len);
		synqe->flags = TPF_SYNQE;
	}

	return (synqe);
}

/*
 * Translate the TCP options reported by the chip (t4opt) into the kernel's
 * struct tcpopt.  *to is zeroed first; only the MSS value and window scale
 * are copied, timestamps and SACK are recorded as flags only.
 */
static void
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
{
	bzero(to, sizeof(*to));

	if (t4opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t4opt->mss);
	}

	if (t4opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t4opt->wsf;
	}

	if (t4opt->tstamp)
		to->to_flags |= TOF_TS;

	if (t4opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

/*
 * Options2 for passive open.
 */
static uint32_t
calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
	const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
	struct cc_algo *cc, const struct offload_settings *s)
{
	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
	uint32_t opt2 = 0;

	/* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel.
*/ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323))) opt2 |= F_SACK_EN; if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) opt2 |= F_TSTAMPS_EN; if (tcpopt->wsf < 15 && V_tcp_do_rfc1323) opt2 |= F_WND_SCALE_EN; if (th->th_flags & (TH_ECE | TH_CWR) && (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) opt2 |= F_CCTRL_ECN; /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); /* These defaults are subject to ULP specific fixups later. */ opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); opt2 |= V_PACE(0); if (s->cong_algo >= 0) opt2 |= V_CONG_CNTRL(s->cong_algo); else if (sc->tt.cong_algorithm >= 0) opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { if (strcasecmp(cc->name, "reno") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); else if (strcasecmp(cc->name, "tahoe") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); if (strcasecmp(cc->name, "newreno") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); if (strcasecmp(cc->name, "highspeed") == 0) opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); } } if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); /* Note that ofld_rxq is already set according to s->rxq. 
*/ opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (ulp_mode == ULP_MODE_TCPDDP) opt2 |= F_RX_FC_DDP; #endif if (ulp_mode == ULP_MODE_TLS) { opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } return (htobe32(opt2)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; if (inc->inc_flags & INC_ISIPV6) { struct nhop6_basic nh6; bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, 0, 0, &nh6) != 0) return (NULL); if (nh6.nh_ifp != ifp) return (NULL); 
((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr; } else { struct nhop4_basic nh4; dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0, &nh4) != 0) return (NULL); if (nh4.nh_ifp != ifp) return (NULL); ((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr; } e = t4_l2t_get(pi, ifp, dst); return (e); } #define REJECT_PASS_ACCEPT() do { \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); struct cpl_pass_accept_rpl *rpl; struct wrqe *wr; unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; struct ifnet *hw_ifp, *ifp; struct l2t_entry *e = NULL; int rscale, mtu_idx, rx_credits, rxqid, ulp_mode; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid; u_int wnd; + struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); 
pass_accept_req_to_protohdrs(sc, m, &inc, &th); t4opt_to_tcpopt(&cpl->tcpopt, &to); pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))]; CURVNET_SET(lctx->vnet); /* * Use the MAC index to lookup the associated VI. If this SYN * didn't match a perfect MAC filter, punt. */ if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) { m_freem(m); m = NULL; REJECT_PASS_ACCEPT(); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info))) goto found; } m_freem(m); m = NULL; REJECT_PASS_ACCEPT(); found: hw_ifp = vi->ifp; /* the (v)cxgbeX ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. */ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT(); } else ifp = hw_ifp; /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. */ if (cpl->tcpopt.unknown) REJECT_PASS_ACCEPT(); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT(); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) REJECT_PASS_ACCEPT(); ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT(); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ if (!in_ifhasaddr(ifp, inc.inc_laddr)) REJECT_PASS_ACCEPT(); ntids = 1; } /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. 
*/ if (lctx->vnet != ifp->if_vnet) REJECT_PASS_ACCEPT(); e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) REJECT_PASS_ACCEPT(); synqe = mbuf_to_synqe(m); if (synqe == NULL) REJECT_PASS_ACCEPT(); wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]); if (wr == NULL) REJECT_PASS_ACCEPT(); rpl = wrtod(wr); - INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */ + INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for 4-tuple check */ /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); free(wr, M_CXGBE); REJECT_PASS_ACCEPT(); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); inp = lctx->inp; /* listening socket, not owned by TOE */ INP_WLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { /* * The listening socket has closed. The reply from the TOE to * our CPL_CLOSE_LISTSRV_REQ will ultimately release all * resources tied to this listen context. */ INP_WUNLOCK(inp); free(wr, M_CXGBE); REJECT_PASS_ACCEPT(); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_WUNLOCK(inp); free(wr, M_CXGBE); REJECT_PASS_ACCEPT(); } mtu_idx = find_best_mtu_idx(sc, &inc, &settings); rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? 
select_rcv_wscale() : 0; /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); wnd = min(wnd, MAX_RCV_WND); rx_credits = min(wnd >> 10, M_RCV_BUFSIZ); save_qids_in_mbuf(m, vi, &settings); get_qids_from_mbuf(m, NULL, &rxqid); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); } ulp_mode = select_ulp_mode(so, sc, &settings); switch (ulp_mode) { case ULP_MODE_TCPDDP: synqe->flags |= TPF_SYNQE_TCPDDP; break; case ULP_MODE_TLS: synqe->flags |= TPF_SYNQE_TLS; break; } rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode, &settings); rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode, CC_ALGO(intotcpcb(inp)), &settings); synqe->tid = tid; synqe->lctx = lctx; synqe->syn = m; m = NULL; refcount_init(&synqe->refcnt, 1); /* 1 means extra hold */ synqe->l2e_idx = e->idx; synqe->rcv_bufsize = rx_credits; atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr); insert_tid(sc, tid, synqe, ntids); TAILQ_INSERT_TAIL(&lctx->synq, synqe, link); hold_synqe(synqe); /* hold for the duration it's in the synq */ hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */ /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. */ toe_syncache_add(&inc, &to, &th, inp, tod, synqe); INP_UNLOCK_ASSERT(inp); /* ok to assert, we have a ref on the inp */ /* * If we replied during syncache_add (synqe->wr has been consumed), * good. Otherwise, set it to 0 so that further syncache_respond * attempts by the kernel will be ignored. */ if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) { /* * syncache may or may not have a hold on the synqe, which may * or may not be stashed in the original SYN mbuf passed to us. * Just copy it over instead of dealing with all possibilities. 
*/ m = m_dup(synqe->syn, M_NOWAIT); if (m) m->m_pkthdr.rcvif = hw_ifp; remove_tid(sc, synqe->tid, ntids); free(wr, M_CXGBE); /* Yank the synqe out of the lctx synq. */ INP_WLOCK(inp); TAILQ_REMOVE(&lctx->synq, synqe, link); release_synqe(synqe); /* removed from synq list */ inp = release_lctx(sc, lctx); if (inp) INP_WUNLOCK(inp); release_synqe(synqe); /* extra hold */ REJECT_PASS_ACCEPT(); } CTR6(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK mode %d", __func__, stid, tid, lctx, synqe, ulp_mode); INP_WLOCK(inp); synqe->flags |= TPF_SYNQE_HAS_L2TE; if (__predict_false(inp->inp_flags & INP_DROPPED)) { /* * Listening socket closed but tod_listen_stop did not abort * this tid because there was no L2T entry for the tid at that * time. Abort it now. The reply to the abort will clean up. */ CTR6(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT", __func__, stid, tid, lctx, synqe, synqe->flags); if (!(synqe->flags & TPF_SYNQE_EXPANDED)) send_reset_synqe(tod, synqe); INP_WUNLOCK(inp); CURVNET_RESTORE(); release_synqe(synqe); /* extra hold */ return (__LINE__); } INP_WUNLOCK(inp); CURVNET_RESTORE(); release_synqe(synqe); /* extra hold */ return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (__predict_true(m != NULL)) { m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; hw_ifp->if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = 
TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; struct ifnet *ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; + struct epoch_tracker et; u_int txqid, rxqid; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); - INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); if (__predict_false(inp->inp_flags & INP_DROPPED)) { if (synqe->flags & TPF_SYNQE_HAS_L2TE) { KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: listen socket closed but tid %u not aborted.", __func__, tid)); } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); return (0); } ifp = synqe->syn->m_pkthdr.rcvif; vi = ifp->if_softc; KASSERT(vi->pi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 
get_qids_from_mbuf(synqe->syn, &txqid, &rxqid); KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT); if (toep == NULL) { reset: /* * The reply to this abort will perform final cleanup. There is * no need to check for HAS_L2TE here. We can be here only if * we responded to the PASS_ACCEPT_REQ, and our response had the * L2T idx. */ send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); return (0); } toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx]; if (synqe->flags & TPF_SYNQE_TCPDDP) set_ulp_mode(toep, ULP_MODE_TCPDDP); else if (synqe->flags & TPF_SYNQE_TLS) set_ulp_mode(toep, ULP_MODE_TLS); else set_ulp_mode(toep, ULP_MODE_NONE); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = synqe->rcv_bufsize; so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); /* * No more need for anything in the mbuf that carried the * CPL_PASS_ACCEPT_REQ. Drop the CPL_PASS_ESTABLISH and toep pointer * there. XXX: bad form but I don't want to increase the size of synqe. */ m = synqe->syn; KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len, ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len)); bcopy(cpl, mtod(m, void *), sizeof(*cpl)); *(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep; if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). 
/*
 * Register the handlers for the four CPL messages used on the passive-open
 * (listen) path: CPL_PASS_OPEN_RPL, CPL_CLOSE_LISTSRV_RPL,
 * CPL_PASS_ACCEPT_REQ and CPL_PASS_ESTABLISH.
 */
void
t4_init_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}
* Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while * the mbuf is in the ulp_pdu_reclaimq. */ #define tls_tcp_seq PH_loc.thirtytwo[0] /* * Handshake lock used for the handshake timer. Having a global lock * is perhaps not ideal, but it avoids having to use callout_drain() * in tls_uninit_toep() which can't block. Also, the timer shouldn't * actually fire for most connections. 
*/ static struct mtx tls_handshake_lock; static void t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, toep->ofld_txq, toep, word, mask, val, 0, 0); } /* TLS and DTLS common routines */ bool can_tls_offload(struct adapter *sc) { return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS); } int tls_tx_key(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; return (tls_ofld->tx_key_addr >= 0); } int tls_rx_key(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; return (tls_ofld->rx_key_addr >= 0); } static int key_size(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; return ((tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) ? tls_ofld->k_ctx.tx_key_info_size : KEY_IN_DDR_SIZE); } /* Set TLS Key-Id in TCB */ static void t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id) { t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG, V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG), V_TCB_RX_TLS_KEY_TAG(key_id)); } /* Clear TF_RX_QUIESCE to re-enable receive. */ static void t4_clear_rx_quiesce(struct toepcb *toep) { t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0); } static void tls_clr_ofld_mode(struct toepcb *toep) { tls_stop_handshake_timer(toep); /* Operate in PDU extraction mode only. 
*/ t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1))); t4_clear_rx_quiesce(toep); } static void tls_clr_quiesce(struct toepcb *toep) { tls_stop_handshake_timer(toep); t4_clear_rx_quiesce(toep); } /* * Calculate the TLS data expansion size */ static int tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only, unsigned short *pdus_per_ulp) { struct tls_ofld_info *tls_ofld = &toep->tls; struct tls_scmd *scmd = &tls_ofld->scmd0; int expn_size = 0, frag_count = 0, pad_per_pdu = 0, pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0; int exp_per_pdu = 0; int hdr_len = TLS_HEADER_LENGTH; do { max_frag_size = tls_ofld->k_ctx.frag_size; if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) == SCMD_CIPH_MODE_AES_GCM) { frag_count = (data_len / max_frag_size); exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE + hdr_len; expn_size = frag_count * exp_per_pdu; if (full_pdus_only) { *pdus_per_ulp = data_len / (exp_per_pdu + max_frag_size); if (*pdus_per_ulp > 32) *pdus_per_ulp = 32; else if(!*pdus_per_ulp) *pdus_per_ulp = 1; expn_size = (*pdus_per_ulp) * exp_per_pdu; break; } if ((last_frag_size = data_len % max_frag_size) > 0) { frag_count += 1; expn_size += exp_per_pdu; } break; } else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) != SCMD_CIPH_MODE_NOP) { /* Calculate the number of fragments we can make */ frag_count = (data_len / max_frag_size); if (frag_count > 0) { pad_per_pdu = (((howmany((max_frag_size + tls_ofld->mac_length), CIPHER_BLOCK_SIZE)) * CIPHER_BLOCK_SIZE) - (max_frag_size + tls_ofld->mac_length)); if (!pad_per_pdu) pad_per_pdu = CIPHER_BLOCK_SIZE; exp_per_pdu = pad_per_pdu + tls_ofld->mac_length + hdr_len + CIPHER_BLOCK_SIZE; expn_size = frag_count * exp_per_pdu; } if (full_pdus_only) { *pdus_per_ulp = data_len / (exp_per_pdu + max_frag_size); if (*pdus_per_ulp > 32) *pdus_per_ulp = 32; else if (!*pdus_per_ulp) *pdus_per_ulp = 1; expn_size = (*pdus_per_ulp) * exp_per_pdu; break; } /* Consider the 
last fragment */ if ((last_frag_size = data_len % max_frag_size) > 0) { pad_last_pdu = (((howmany((last_frag_size + tls_ofld->mac_length), CIPHER_BLOCK_SIZE)) * CIPHER_BLOCK_SIZE) - (last_frag_size + tls_ofld->mac_length)); if (!pad_last_pdu) pad_last_pdu = CIPHER_BLOCK_SIZE; expn_size += (pad_last_pdu + tls_ofld->mac_length + hdr_len + CIPHER_BLOCK_SIZE); } } } while (0); return (expn_size); } /* Copy Key to WR */ static void tls_copy_tx_key(struct toepcb *toep, void *dst) { struct tls_ofld_info *tls_ofld = &toep->tls; struct ulptx_sc_memrd *sc_memrd; struct ulptx_idata *sc; if (tls_ofld->k_ctx.tx_key_info_size <= 0) return; if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR) { sc = dst; sc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); sc->len = htobe32(0); sc_memrd = (struct ulptx_sc_memrd *)(sc + 1); sc_memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) | V_ULP_TX_SC_MORE(1) | V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4)); sc_memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5); } else if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) { memcpy(dst, &tls_ofld->k_ctx.tx, tls_ofld->k_ctx.tx_key_info_size); } } /* TLS/DTLS content type for CPL SFO */ static inline unsigned char tls_content_type(unsigned char content_type) { /* * XXX: Shouldn't this map CONTENT_TYPE_APP_DATA to DATA and * default to "CUSTOM" for all other types including * heartbeat? 
*/ switch (content_type) { case CONTENT_TYPE_CCS: return CPL_TX_TLS_SFO_TYPE_CCS; case CONTENT_TYPE_ALERT: return CPL_TX_TLS_SFO_TYPE_ALERT; case CONTENT_TYPE_HANDSHAKE: return CPL_TX_TLS_SFO_TYPE_HANDSHAKE; case CONTENT_TYPE_HEARTBEAT: return CPL_TX_TLS_SFO_TYPE_HEARTBEAT; } return CPL_TX_TLS_SFO_TYPE_DATA; } static unsigned char get_cipher_key_size(unsigned int ck_size) { switch (ck_size) { case AES_NOP: /* NOP */ return 15; case AES_128: /* AES128 */ return CH_CK_SIZE_128; case AES_192: /* AES192 */ return CH_CK_SIZE_192; case AES_256: /* AES256 */ return CH_CK_SIZE_256; default: return CH_CK_SIZE_256; } } static unsigned char get_mac_key_size(unsigned int mk_size) { switch (mk_size) { case SHA_NOP: /* NOP */ return CH_MK_SIZE_128; case SHA_GHASH: /* GHASH */ case SHA_512: /* SHA512 */ return CH_MK_SIZE_512; case SHA_224: /* SHA2-224 */ return CH_MK_SIZE_192; case SHA_256: /* SHA2-256*/ return CH_MK_SIZE_256; case SHA_384: /* SHA384 */ return CH_MK_SIZE_512; case SHA1: /* SHA1 */ default: return CH_MK_SIZE_160; } } static unsigned int get_proto_ver(int proto_ver) { switch (proto_ver) { case TLS1_2_VERSION: return TLS_1_2_VERSION; case TLS1_1_VERSION: return TLS_1_1_VERSION; case DTLS1_2_VERSION: return DTLS_1_2_VERSION; default: return TLS_VERSION_MAX; } } static void tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx) { if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) { kwr->u.rxhdr.ivinsert_to_authinsrt = htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) | V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) | V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) | V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) | V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) | V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) | V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) | V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL)); kwr->u.rxhdr.ivpresent_to_rxmk_size &= ~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1)); kwr->u.rxhdr.authmode_to_rxvalid &= ~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1)); } else { kwr->u.rxhdr.ivinsert_to_authinsrt = 
htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) | V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) | V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) | V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) | V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) | V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) | V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) | V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL)); } } /* Rx key */ static void prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx) { unsigned int ck_size = kctx->cipher_secret_size; unsigned int mk_size = kctx->mac_secret_size; int proto_ver = kctx->proto_ver; kwr->u.rxhdr.flitcnt_hmacctrl = ((kctx->tx_key_info_size >> 4) << 3) | kctx->hmac_ctrl; kwr->u.rxhdr.protover_ciphmode = V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) | V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode); kwr->u.rxhdr.authmode_to_rxvalid = V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) | V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) | V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) | V_TLS_KEYCTX_TX_WR_RXVALID(1); kwr->u.rxhdr.ivpresent_to_rxmk_size = V_TLS_KEYCTX_TX_WR_IVPRESENT(0) | V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) | V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) | V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size)); tls_rxkey_flit1(kwr, kctx); /* No key reversal for GCM */ if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) { t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key, (kctx->cipher_secret_size << 3)); memcpy(kwr->keys.edkey + kctx->cipher_secret_size, kctx->rx.key + kctx->cipher_secret_size, (IPAD_SIZE + OPAD_SIZE)); } else { memcpy(kwr->keys.edkey, kctx->rx.key, (kctx->tx_key_info_size - SALT_SIZE)); memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE); } } /* Tx key */ static void prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx) { unsigned int ck_size = kctx->cipher_secret_size; unsigned int mk_size = kctx->mac_secret_size; kwr->u.txhdr.ctxlen = (kctx->tx_key_info_size >> 4); kwr->u.txhdr.dualck_to_txvalid = V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) | 
V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) | V_TLS_KEYCTX_TX_WR_TXCK_SIZE(get_cipher_key_size(ck_size)) | V_TLS_KEYCTX_TX_WR_TXMK_SIZE(get_mac_key_size(mk_size)) | V_TLS_KEYCTX_TX_WR_TXVALID(1); memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE); if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) { memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE); kwr->u.txhdr.dualck_to_txvalid &= ~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1)); } kwr->u.txhdr.dualck_to_txvalid = htons(kwr->u.txhdr.dualck_to_txvalid); } /* TLS Key memory management */ int tls_init_kmap(struct adapter *sc, struct tom_data *td) { td->key_map = vmem_create("T4TLS key map", sc->vres.key.start, sc->vres.key.size, 8, 0, M_FIRSTFIT | M_NOWAIT); if (td->key_map == NULL) return (ENOMEM); return (0); } void tls_free_kmap(struct tom_data *td) { if (td->key_map != NULL) vmem_destroy(td->key_map); } static int get_new_keyid(struct toepcb *toep, struct tls_key_context *k_ctx) { struct tom_data *td = toep->td; vmem_addr_t addr; if (vmem_alloc(td->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT, &addr) != 0) return (-1); return (addr); } static void free_keyid(struct toepcb *toep, int keyid) { struct tom_data *td = toep->td; vmem_free(td->key_map, keyid, TLS_KEY_CONTEXT_SZ); } static void clear_tls_keyid(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; if (tls_ofld->rx_key_addr >= 0) { free_keyid(toep, tls_ofld->rx_key_addr); tls_ofld->rx_key_addr = -1; } if (tls_ofld->tx_key_addr >= 0) { free_keyid(toep, tls_ofld->tx_key_addr); tls_ofld->tx_key_addr = -1; } } static int get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops) { return (ops & KEY_WRITE_RX ? tls_ofld->rx_key_addr : ((ops & KEY_WRITE_TX) ? tls_ofld->tx_key_addr : -1)); } static int get_tp_plen_max(struct tls_ofld_info *tls_ofld) { int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448; return (tls_ofld->k_ctx.frag_size <= 8192 ? 
plen : FC_TP_PLEN_MAX); } /* Send request to get the key-id */ static int tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); struct ofld_tx_sdesc *txsd; int kwrlen, kctxlen, keyid, len; struct wrqe *wr; struct tls_key_req *kwr; struct tls_keyctx *kctx; kwrlen = roundup2(sizeof(*kwr), 16); kctxlen = roundup2(sizeof(*kctx), 32); len = kwrlen + kctxlen; if (toep->txsd_avail == 0) return (EAGAIN); /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) { if ((keyid = get_new_keyid(toep, k_ctx)) < 0) { return (ENOSPC); } } else { keyid = get_keyid(tls_ofld, k_ctx->l_p_key); } wr = alloc_wrqe(len, toep->ofld_txq); if (wr == NULL) { free_keyid(toep, keyid); return (ENOMEM); } kwr = wrtod(wr); memset(kwr, 0, kwrlen); kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL | F_FW_WR_ATOMIC); kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) | V_FW_WR_FLOWID(toep->tid)); kwr->protocol = get_proto_ver(k_ctx->proto_ver); kwr->mfs = htons(k_ctx->frag_size); kwr->reneg_to_write_rx = k_ctx->l_p_key; /* master command */ kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1)); kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5)); kwr->len16 = htobe32((toep->tid << 8) | DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16)); kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5)); /* sub command */ kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); kwr->sc_len = htobe32(kctxlen); /* XXX: This assumes that kwrlen == sizeof(*kwr). 
*/ kctx = (struct tls_keyctx *)(kwr + 1); memset(kctx, 0, kctxlen); if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) { tls_ofld->tx_key_addr = keyid; prepare_txkey_wr(kctx, k_ctx); } else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { tls_ofld->rx_key_addr = keyid; prepare_rxkey_wr(kctx, k_ctx); } txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = DIV_ROUND_UP(len, 16); txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); return (0); } /* Store a key received from SSL in DDR. */ static int program_key_context(struct tcpcb *tp, struct toepcb *toep, struct tls_key_context *uk_ctx) { struct adapter *sc = td_adapter(toep->td); struct tls_ofld_info *tls_ofld = &toep->tls; struct tls_key_context *k_ctx; int error, key_offset; if (tp->t_state != TCPS_ESTABLISHED) { /* * XXX: Matches Linux driver, but not sure this is a * very appropriate error. */ return (ENOENT); } /* Stop timer on handshake completion */ tls_stop_handshake_timer(toep); toep->flags &= ~TPF_FORCE_CREDITS; CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid, G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" : "KEY_WRITE_TX", uk_ctx->proto_ver); if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX && toep->ulp_mode != ULP_MODE_TLS) return (EOPNOTSUPP); /* Don't copy the 'tx' and 'rx' fields. 
*/ k_ctx = &tls_ofld->k_ctx; memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key, sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key)); /* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */ if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) { if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { tls_ofld->rx_key_addr = -1; t4_clear_rx_quiesce(toep); } else { tls_ofld->tx_key_addr = -1; } return (0); } if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) { k_ctx->iv_size = 4; k_ctx->mac_first = 0; k_ctx->hmac_ctrl = 0; } else { k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */ k_ctx->mac_first = 1; } tls_ofld->scmd0.seqno_numivs = (V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) | V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) | V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) | V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) | V_SCMD_IV_SIZE(k_ctx->iv_size)); tls_ofld->scmd0.ivgen_hdrlen = (V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) | V_SCMD_KEY_CTX_INLINE(0) | V_SCMD_TLS_FRAG_ENABLE(1)); tls_ofld->mac_length = k_ctx->mac_secret_size; if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { k_ctx->rx = uk_ctx->rx; /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) tls_ofld->rx_key_addr = -1; } else { k_ctx->tx = uk_ctx->tx; /* Dont initialize key for re-neg */ if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) tls_ofld->tx_key_addr = -1; } /* Flush pending data before new Tx key becomes active */ if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) { struct sockbuf *sb; /* XXX: This might not drain everything. */ t4_push_frames(sc, toep, 0); sb = &toep->inp->inp_socket->so_snd; SOCKBUF_LOCK(sb); /* XXX: This asserts that everything has been pushed. 
*/ MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL); sb->sb_sndptr = NULL; tls_ofld->sb_off = sbavail(sb); SOCKBUF_UNLOCK(sb); tls_ofld->tx_seq_no = 0; } if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) || (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) { error = tls_program_key_id(toep, k_ctx); if (error) { /* XXX: Only clear quiesce for KEY_WRITE_RX? */ t4_clear_rx_quiesce(toep); return (error); } } if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) { /* * RX key tags are an index into the key portion of MA * memory stored as an offset from the base address in * units of 64 bytes. */ key_offset = tls_ofld->rx_key_addr - sc->vres.key.start; t4_set_tls_keyid(toep, key_offset / 64); t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) | V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1)))); t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ, V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(0)); t4_clear_rx_quiesce(toep); } else { unsigned short pdus_per_ulp; if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) tls_ofld->tx_key_addr = 1; tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld); tls_ofld->expn_per_ulp = tls_expansion_size(toep, tls_ofld->fcplenmax, 1, &pdus_per_ulp); tls_ofld->pdus_per_ulp = pdus_per_ulp; tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp * ((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) + tls_ofld->k_ctx.frag_size); } return (0); } /* * In some cases a client connection can hang without sending the * ServerHelloDone message from the NIC to the host. Send a dummy * RX_DATA_ACK with RX_MODULATE to unstick the connection. */ static void tls_send_handshake_ack(void *arg) { struct toepcb *toep = arg; struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); /* * XXX: Does not have the t4_get_tcb() checks to refine the * workaround. 
*/ callout_schedule(&tls_ofld->handshake_timer, TLS_SRV_HELLO_RD_TM * hz); CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid); send_rx_modulate(sc, toep); } static void tls_start_handshake_timer(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; mtx_lock(&tls_handshake_lock); callout_reset(&tls_ofld->handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz, tls_send_handshake_ack, toep); mtx_unlock(&tls_handshake_lock); } void tls_stop_handshake_timer(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; mtx_lock(&tls_handshake_lock); callout_stop(&tls_ofld->handshake_timer); mtx_unlock(&tls_handshake_lock); } int t4_ctloutput_tls(struct socket *so, struct sockopt *sopt) { struct tls_key_context uk_ctx; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; int error, optval; error = 0; if (sopt->sopt_dir == SOPT_SET && sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) { error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx), sizeof(uk_ctx)); if (error) return (error); } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); toep = tp->t_toe; switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case TCP_TLSOM_SET_TLS_CONTEXT: error = program_key_context(tp, toep, &uk_ctx); INP_WUNLOCK(inp); break; case TCP_TLSOM_CLR_TLS_TOM: if (toep->ulp_mode == ULP_MODE_TLS) { CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM", __func__, toep->tid); tls_clr_ofld_mode(toep); } else error = EOPNOTSUPP; INP_WUNLOCK(inp); break; case TCP_TLSOM_CLR_QUIES: if (toep->ulp_mode == ULP_MODE_TLS) { CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES", __func__, toep->tid); tls_clr_quiesce(toep); } else error = EOPNOTSUPP; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = EOPNOTSUPP; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case TCP_TLSOM_GET_TLS_TOM: /* * TLS TX is permitted on any TOE 
socket, but * TLS RX requires a TLS ULP mode. */ optval = TLS_TOM_NONE; if (can_tls_offload(td_adapter(toep->td))) { switch (toep->ulp_mode) { case ULP_MODE_NONE: case ULP_MODE_TCPDDP: optval = TLS_TOM_TXONLY; break; case ULP_MODE_TLS: optval = TLS_TOM_BOTH; break; } } CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d", __func__, toep->tid, optval); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = EOPNOTSUPP; break; } break; } return (error); } void tls_init_toep(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; tls_ofld->key_location = TLS_SFO_WR_CONTEXTLOC_DDR; tls_ofld->rx_key_addr = -1; tls_ofld->tx_key_addr = -1; if (toep->ulp_mode == ULP_MODE_TLS) callout_init_mtx(&tls_ofld->handshake_timer, &tls_handshake_lock, 0); } void tls_establish(struct toepcb *toep) { /* * Enable PDU extraction. * * XXX: Supposedly this should be done by the firmware when * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but * in practice this seems to be required. */ CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid); t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1))); toep->flags |= TPF_FORCE_CREDITS; tls_start_handshake_timer(toep); } void tls_uninit_toep(struct toepcb *toep) { if (toep->ulp_mode == ULP_MODE_TLS) tls_stop_handshake_timer(toep); clear_tls_keyid(toep); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TLSTX_CREDITS(toep) \ (howmany(sizeof(struct fw_tlstx_data_wr) + \ sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) + \ CIPHER_BLOCK_SIZE + 1, 16)) static inline u_int max_imm_tls_space(int tx_credits) { const int n = 2; /* Use only up to 2 desc for imm. 
data WR */ int space; KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS, ("%s: %d credits", __func__, tx_credits)); if (tx_credits >= (n * EQ_ESIZE) / 16) space = (n * EQ_ESIZE); else space = tx_credits * 16; return (space); } static int count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp) { int max_nsegs_1mbuf, n, nsegs; while (skip >= m->m_len) { skip -= m->m_len; m = m->m_next; } nsegs = 0; max_nsegs_1mbuf = 0; while (len > 0) { n = sglist_count(mtod(m, char *) + skip, m->m_len - skip); if (n > max_nsegs_1mbuf) max_nsegs_1mbuf = n; nsegs += n; len -= m->m_len - skip; skip = 0; m = m->m_next; } *max_nsegs_1mbufp = max_nsegs_1mbuf; return (nsegs); } static void write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep, unsigned int immdlen, unsigned int plen, unsigned int expn, unsigned int pdus, uint8_t credits, int shove, int imm_ivs) { struct tls_ofld_info *tls_ofld = &toep->tls; unsigned int len = plen + expn; txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) | V_FW_TLSTX_DATA_WR_COMPL(1) | V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen)); txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) | V_FW_TLSTX_DATA_WR_LEN16(credits)); txwr->plen = htobe32(len); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) | V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove)); txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(pdus) | V_FW_TLSTX_DATA_WR_EXP(expn) | V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) | V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) | V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->k_ctx.tx_key_info_size >> 4)); txwr->mfs = htobe16(tls_ofld->k_ctx.frag_size); txwr->adjustedplen_pkd = htobe16( V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen)); txwr->expinplenmax_pkd = htobe16( V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp)); txwr->pdusinplenmax_pkd = htobe16( V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp)); } static void write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb 
*toep, struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus) { struct tls_ofld_info *tls_ofld = &toep->tls; int data_type, seglen; if (plen < tls_ofld->k_ctx.frag_size) seglen = plen; else seglen = tls_ofld->k_ctx.frag_size; data_type = tls_content_type(tls_hdr->type); cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) | V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) | V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen)); cpl->pld_len = htobe32(plen); if (data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT) cpl->type_protover = htobe32( V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type)); cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs | V_SCMD_NUM_IVS(pdus)); cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen); cpl->scmd1 = htobe64(tls_ofld->tx_seq_no); tls_ofld->tx_seq_no += pdus; } /* * Similar to write_tx_sgl() except that it accepts an optional * trailer buffer for IVs. */ static void write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen, void *iv_buffer, int iv_len, int nsegs, int n) { struct mbuf *m; struct ulptx_sgl *usgl = dst; int i, j, rc; struct sglist sg; struct sglist_seg segs[n]; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); sglist_init(&sg, n, segs); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); for (m = start; skip >= m->m_len; m = m->m_next) skip -= m->m_len; i = -1; for (m = start; plen > 0; m = m->m_next) { rc = sglist_append(&sg, mtod(m, char *) + skip, m->m_len - skip); if (__predict_false(rc != 0)) panic("%s: sglist_append %d", __func__, rc); plen -= m->m_len - skip; skip = 0; for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } sglist_reset(&sg); } if (iv_buffer != NULL) { rc = sglist_append(&sg, iv_buffer, iv_len); if (__predict_false(rc != 
0)) panic("%s: sglist_append %d", __func__, rc); for (j = 0; j < sg.sg_nseg; i++, j++) { if (i < 0) { usgl->len0 = htobe32(segs[j].ss_len); usgl->addr0 = htobe64(segs[j].ss_paddr); } else { usgl->sge[i / 2].len[i & 1] = htobe32(segs[j].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[j].ss_paddr); } #ifdef INVARIANTS nsegs--; #endif } } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p", __func__, nsegs, start, iv_buffer)); } /* * Similar to t4_push_frames() but handles TLS sockets when TLS offload * is enabled. Rather than transmitting bulk data, the socket buffer * contains TLS records. The work request requires a full TLS record, * so batch mbufs up until a full TLS record is seen. This requires * reading the TLS header out of the start of each record to determine * its length. */ void t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop) { struct tls_hdr thdr; struct mbuf *sndptr; struct fw_tlstx_data_wr *txwr; struct cpl_tx_tls_sfo *cpl; struct wrqe *wr; u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len; u_int expn_size, iv_len, pdus, sndptroff; struct tls_ofld_info *tls_ofld = &toep->tls; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tls_size, tx_credits, shove, /* compl,*/ sowwakeup; struct ofld_tx_sdesc *txsd; bool imm_ivs, imm_payload; void *iv_buffer, *iv_dst, *buf; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(toep->ulp_mode == ULP_MODE_NONE || toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags); 
#endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; for (;;) { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); space = max_imm_tls_space(tx_credits); wr_len = sizeof(struct fw_tlstx_data_wr) + sizeof(struct cpl_tx_tls_sfo) + key_size(toep); if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) { #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d tx_credits %d min_wr %d space %d", __func__, toep->tid, tx_credits, wr_len + CIPHER_BLOCK_SIZE + 1, space); #endif return; } SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); MPASS(tls_ofld->sb_off >= drop); tls_ofld->sb_off -= drop; drop = 0; } /* * Send a FIN if requested, but only if there's no * more data to send. */ if (sbavail(sb) == tls_ofld->sb_off && toep->flags & TPF_SEND_FIN) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); t4_close_conn(sc, toep); return; } if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) { /* * A full TLS header is not yet queued, stop * for now until more data is added to the * socket buffer. However, if the connection * has been closed, we will never get the rest * of the header so just discard the partial * header and close the connection. */ #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d%s", __func__, toep->tid, sbavail(sb), tls_ofld->sb_off, toep->flags & TPF_SEND_FIN ? 
"" : " SEND_FIN"); #endif if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); return; } /* Read the header of the next TLS record. */ sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff); MPASS(!IS_AIOTX_MBUF(sndptr)); m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr); tls_size = htons(thdr.length); plen = TLS_HEADER_LENGTH + tls_size; pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size); iv_len = pdus * CIPHER_BLOCK_SIZE; if (sbavail(sb) < tls_ofld->sb_off + plen) { /* * The full TLS record is not yet queued, stop * for now until more data is added to the * socket buffer. However, if the connection * has been closed, we will never get the rest * of the record so just discard the partial * record and close the connection. */ #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d plen %d%s", __func__, toep->tid, sbavail(sb), tls_ofld->sb_off, plen, toep->flags & TPF_SEND_FIN ? "" : " SEND_FIN"); #endif if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (toep->flags & TPF_SEND_FIN) t4_close_conn(sc, toep); return; } /* Shove if there is no additional data pending. */ shove = (sbavail(sb) == tls_ofld->sb_off + plen) && !(tp->t_flags & TF_MORETOCOME); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); /* Determine whether to use immediate vs SGL. 
*/ imm_payload = false; imm_ivs = false; if (wr_len + iv_len <= space) { imm_ivs = true; wr_len += iv_len; if (wr_len + tls_size <= space) { wr_len += tls_size; imm_payload = true; } } /* Allocate space for IVs if needed. */ if (!imm_ivs) { iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT); if (iv_buffer == NULL) { /* * XXX: How to restart this? */ if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); CTR3(KTR_CXGBE, "%s: tid %d failed to alloc IV space len %d", __func__, toep->tid, iv_len); return; } } else iv_buffer = NULL; /* Determine size of SGL. */ nsegs = 0; max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ if (!imm_payload) { nsegs = count_mbuf_segs(sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf); if (!imm_ivs) { int n = sglist_count(iv_buffer, iv_len); nsegs += n; if (n > max_nsegs_1mbuf) max_nsegs_1mbuf = n; } /* Account for SGL in work request length. */ wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; } wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d", __func__, toep->tid, thdr.type, tls_size, pdus); #endif txwr = wrtod(wr); cpl = (struct cpl_tx_tls_sfo *)(txwr + 1); memset(txwr, 0, roundup2(wr_len, 16)); credits = howmany(wr_len, 16); expn_size = tls_expansion_size(toep, tls_size, 0, NULL); write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0, tls_size, expn_size, pdus, credits, shove, imm_ivs ? 
1 : 0); write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus); tls_copy_tx_key(toep, cpl + 1); /* Generate random IVs */ buf = (char *)(cpl + 1) + key_size(toep); if (imm_ivs) { MPASS(iv_buffer == NULL); iv_dst = buf; buf = (char *)iv_dst + iv_len; } else iv_dst = iv_buffer; arc4rand(iv_dst, iv_len, 0); if (imm_payload) { m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, buf); } else { write_tlstx_sgl(buf, sndptr, sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer, iv_len, nsegs, max_nsegs_1mbuf); } KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; tp->snd_nxt += plen; tp->snd_max += plen; SOCKBUF_LOCK(sb); sbsndptr(sb, tls_ofld->sb_off, plen, &sndptroff); tls_ofld->sb_off += plen; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep)) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); txsd->plen = plen; txsd->tx_credits = credits; txsd->iv_buffer = iv_buffer; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; atomic_add_long(&toep->vi->pi->tx_tls_records, 1); atomic_add_long(&toep->vi->pi->tx_tls_octets, plen); t4_l2t_send(sc, wr, toep->l2te); } } /* * For TLS data we place received mbufs received via CPL_TLS_DATA into * an mbufq in the TLS offload state. When CPL_RX_TLS_CMP is * received, the completed PDUs are placed into the socket receive * buffer. * * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs. */ static int do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_tls_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; int len; /* XXX: Should this match do_rx_data instead? 
*/ KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; atomic_add_long(&toep->vi->pi->rx_tls_octets, len); KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } /* Save TCP sequence number. */ m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq); if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) { #ifdef INVARIANTS panic("Failed to queue TLS data packet"); #else printf("%s: Failed to queue TLS data packet\n", __func__); INP_WUNLOCK(inp); m_freem(m); return (0); #endif } tp = intotcpcb(inp); tp->t_rcvtime = ticks; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, be32toh(cpl->seq)); #endif INP_WUNLOCK(inp); return (0); } static int do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *); struct tlsrx_hdr_pkt *tls_hdr_pkt; unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; int len, pdu_length, pdu_overhead, sb_length; KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; atomic_add_long(&toep->vi->pi->rx_tls_records, 1); KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & 
(INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length)); tp = intotcpcb(inp); #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u", __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt); #endif tp->rcv_nxt += pdu_length; if (tp->rcv_wnd < pdu_length) { toep->tls.rcv_over += pdu_length - tp->rcv_wnd; tp->rcv_wnd = 0; } else tp->rcv_wnd -= pdu_length; /* XXX: Not sure what to do about urgent data. */ /* * The payload of this CPL is the TLS header followed by * additional fields. */ KASSERT(m->m_len >= sizeof(*tls_hdr_pkt), ("%s: payload too small", __func__)); tls_hdr_pkt = mtod(m, void *); /* * Only the TLS header is sent to OpenSSL, so report errors by * altering the record type. */ if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) tls_hdr_pkt->type = CONTENT_TYPE_ERROR; /* Trim this CPL's mbuf to only include the TLS header. */ KASSERT(m->m_len == len && m->m_next == NULL, ("%s: CPL spans multiple mbufs", __func__)); m->m_len = TLS_HEADER_LENGTH; m->m_pkthdr.len = TLS_HEADER_LENGTH; tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq); if (tls_data != NULL) { KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq, ("%s: sequence mismatch", __func__)); /* * Update the TLS header length to be the length of * the payload data. 
*/ tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len); m->m_next = tls_data; m->m_pkthdr.len += tls_data->m_len; } so = inp_inpcbtosocket(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { + struct epoch_tracker et; + CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, pdu_length); m_freem(m); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); return (0); } /* * Not all of the bytes on the wire are included in the socket * buffer (e.g. the MAC of the TLS record). However, those * bytes are included in the TCP sequence space. To handle * this, compute the delta for this TLS record in * 'pdu_overhead' and treat those bytes as having already been * "read" by the application for the purposes of expanding the * window. The meat of the TLS record passed to the * application ('sb_length') will still not be counted as * "read" until userland actually reads the bytes. * * XXX: Some of the calculations below are probably still not * really correct. 
*/ sb_length = m->m_pkthdr.len; pdu_overhead = pdu_length - sb_length; toep->rx_credits += pdu_overhead; tp->rcv_wnd += pdu_overhead; tp->rcv_adv += pdu_overhead; /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && sb_length > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else toep->rx_credits += newsize - hiwat; } KASSERT(toep->sb_cc >= sbused(sb), ("%s: sb %p has more data (%d) than last time (%d).", __func__, sb, sbused(sb), toep->sb_cc)); toep->rx_credits += toep->sb_cc - sbused(sb); sbappendstream_locked(sb, m, 0); toep->sb_cc = sbused(sb); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u", __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd); #endif if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { int credits; credits = send_rx_credits(sc, toep, toep->rx_credits); toep->rx_credits -= credits; tp->rcv_wnd += credits; tp->rcv_adv += credits; } sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void t4_tls_mod_load(void) { mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF); t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data); t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp); } void t4_tls_mod_unload(void) { t4_register_cpl_handler(CPL_TLS_DATA, NULL); t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL); mtx_destroy(&tls_handshake_lock); } #endif /* TCP_OFFLOAD */ Index: head/sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- head/sys/dev/hwpmc/hwpmc_mod.c (revision 335923) +++ head/sys/dev/hwpmc/hwpmc_mod.c (revision 335924) @@ -1,5979 +1,5982 @@ /*- * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * Copyright (c) 2018 Matthew Macy * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needs to be after */ #include #include #include #include #include #include #include #include "hwpmc_soft.h" #ifdef NUMA #define NDOMAINS vm_ndomains #else #define NDOMAINS 1 #define malloc_domain(size, type, domain, flags) malloc((size), (type), (flags)) #define free_domain(addr, type) free(addr, type) #endif +#define PMC_EPOCH_ENTER() struct epoch_tracker pmc_et; epoch_enter_preempt(global_epoch_preempt, &pmc_et) +#define PMC_EPOCH_EXIT() epoch_exit_preempt(global_epoch_preempt, &pmc_et) + /* * Types */ enum pmc_flags { PMC_FLAG_NONE = 0x00, /* do nothing */ PMC_FLAG_REMOVE = 0x01, /* atomically remove entry from hash */ PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */ PMC_FLAG_NOWAIT = 0x04, /* do not wait for mallocs */ }; /* * The offset in sysent where the syscall is allocated. 
*/ static int pmc_syscall_num = NO_SYSCALL; struct pmc_cpu **pmc_pcpu; /* per-cpu state */ pmc_value_t *pmc_pcpu_saved; /* saved PMC values: CSW handling */ #define PMC_PCPU_SAVED(C,R) pmc_pcpu_saved[(R) + md->pmd_npmc*(C)] struct mtx_pool *pmc_mtxpool; static int *pmc_pmcdisp; /* PMC row dispositions */ #define PMC_ROW_DISP_IS_FREE(R) (pmc_pmcdisp[(R)] == 0) #define PMC_ROW_DISP_IS_THREAD(R) (pmc_pmcdisp[(R)] > 0) #define PMC_ROW_DISP_IS_STANDALONE(R) (pmc_pmcdisp[(R)] < 0) #define PMC_MARK_ROW_FREE(R) do { \ pmc_pmcdisp[(R)] = 0; \ } while (0) #define PMC_MARK_ROW_STANDALONE(R) do { \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= (-pmc_cpu_max_active()), \ ("[pmc,%d] row disposition error", __LINE__)); \ } while (0) #define PMC_UNMARK_ROW_STANDALONE(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) #define PMC_MARK_ROW_THREAD(R) do { \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ } while (0) #define PMC_UNMARK_ROW_THREAD(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) /* various event handlers */ static eventhandler_tag pmc_exit_tag, pmc_fork_tag, pmc_kld_load_tag, pmc_kld_unload_tag; /* Module statistics */ struct pmc_driverstats pmc_stats; /* Machine/processor dependent operations */ static struct pmc_mdep *md; /* * Hash tables mapping owner processes and target threads to PMCs. */ struct mtx pmc_processhash_mtx; /* spin mutex */ static u_long pmc_processhashmask; static LIST_HEAD(pmc_processhash, pmc_process) *pmc_processhash; /* * Hash table of PMC owner descriptors. This table is protected by * the shared PMC "sx" lock. 
*/ static u_long pmc_ownerhashmask; static LIST_HEAD(pmc_ownerhash, pmc_owner) *pmc_ownerhash; /* * List of PMC owners with system-wide sampling PMCs. */ static CK_LIST_HEAD(, pmc_owner) pmc_ss_owners; /* * List of free thread entries. This is protected by the spin * mutex. */ static struct mtx pmc_threadfreelist_mtx; /* spin mutex */ static LIST_HEAD(, pmc_thread) pmc_threadfreelist; static int pmc_threadfreelist_entries=0; #define THREADENTRY_SIZE \ (sizeof(struct pmc_thread) + (md->pmd_npmc * sizeof(struct pmc_threadpmcstate))) /* * Task to free thread descriptors */ static struct grouptask free_gtask; /* * A map of row indices to classdep structures. */ static struct pmc_classdep **pmc_rowindex_to_classdep; /* * Prototypes */ #ifdef HWPMC_DEBUG static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS); static int pmc_debugflags_parse(char *newstr, char *fence); #endif static int load(struct module *module, int cmd, void *arg); static int pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf); static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp); static int pmc_attach_process(struct proc *p, struct pmc *pm); static struct pmc *pmc_allocate_pmc_descriptor(void); static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p); static int pmc_attach_one_process(struct proc *p, struct pmc *pm); static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu); static int pmc_can_attach(struct pmc *pm, struct proc *p); static void pmc_capture_user_callchain(int cpu, int soft, struct trapframe *tf); static void pmc_cleanup(void); static int pmc_detach_process(struct proc *p, struct pmc *pm); static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags); static void pmc_destroy_owner_descriptor(struct pmc_owner *po); static void pmc_destroy_pmc_descriptor(struct pmc *pm); static void pmc_destroy_process_descriptor(struct pmc_process *pp); static struct pmc_owner 
*pmc_find_owner_descriptor(struct proc *p); static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm); static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmc); static struct pmc_process *pmc_find_process_descriptor(struct proc *p, uint32_t mode); static struct pmc_thread *pmc_find_thread_descriptor(struct pmc_process *pp, struct thread *td, uint32_t mode); static void pmc_force_context_switch(void); static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp); static void pmc_log_all_process_mappings(struct pmc_owner *po); static void pmc_log_kernel_mappings(struct pmc *pm); static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p); static void pmc_maybe_remove_owner(struct pmc_owner *po); static void pmc_process_csw_in(struct thread *td); static void pmc_process_csw_out(struct thread *td); static void pmc_process_exit(void *arg, struct proc *p); static void pmc_process_fork(void *arg, struct proc *p1, struct proc *p2, int n); static void pmc_process_samples(int cpu, int soft); static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_process_thread_add(struct thread *td); static void pmc_process_thread_delete(struct thread *td); static void pmc_process_thread_userret(struct thread *td); static void pmc_remove_owner(struct pmc_owner *po); static void pmc_remove_process_descriptor(struct pmc_process *pp); static void pmc_restore_cpu_binding(struct pmc_binding *pb); static void pmc_save_cpu_binding(struct pmc_binding *pb); static void pmc_select_cpu(int cpu); static int pmc_start(struct pmc *pm); static int pmc_stop(struct pmc *pm); static int pmc_syscall_handler(struct thread *td, void *syscall_args); static struct pmc_thread *pmc_thread_descriptor_pool_alloc(void); static void pmc_thread_descriptor_pool_drain(void); static void pmc_thread_descriptor_pool_free(struct pmc_thread *pt); static void pmc_unlink_target_process(struct pmc *pmc, struct pmc_process *pp); static int 
generic_switch_in(struct pmc_cpu *pc, struct pmc_process *pp); static int generic_switch_out(struct pmc_cpu *pc, struct pmc_process *pp); static struct pmc_mdep *pmc_generic_cpu_initialize(void); static void pmc_generic_cpu_finalize(struct pmc_mdep *md); static void pmc_post_callchain_callback(void); static void pmc_process_threadcreate(struct thread *td); static void pmc_process_threadexit(struct thread *td); static void pmc_process_proccreate(struct proc *p); static void pmc_process_allproc(struct pmc *pm); /* * Kernel tunables and sysctl(8) interface. */ SYSCTL_DECL(_kern_hwpmc); SYSCTL_NODE(_kern_hwpmc, OID_AUTO, stats, CTLFLAG_RW, 0, "HWPMC stats"); /* Stats. */ SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_ignored, CTLFLAG_RW, &pmc_stats.pm_intr_ignored, "# of interrupts ignored"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_processed, CTLFLAG_RW, &pmc_stats.pm_intr_processed, "# of interrupts processed"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_bufferfull, CTLFLAG_RW, &pmc_stats.pm_intr_bufferfull, "# of interrupts where buffer was full"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, syscalls, CTLFLAG_RW, &pmc_stats.pm_syscalls, "# of syscalls"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, syscall_errors, CTLFLAG_RW, &pmc_stats.pm_syscall_errors, "# of syscall_errors"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, buffer_requests, CTLFLAG_RW, &pmc_stats.pm_buffer_requests, "# of buffer requests"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, buffer_requests_failed, CTLFLAG_RW, &pmc_stats.pm_buffer_requests_failed, "# of buffer requests which failed"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, log_sweeps, CTLFLAG_RW, &pmc_stats.pm_log_sweeps, "# of ?"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, merges, CTLFLAG_RW, &pmc_stats.pm_merges, "# of times kernel stack was found for user trace"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, overwrites, CTLFLAG_RW, &pmc_stats.pm_overwrites, "# of times a sample 
was overwritten before being logged"); static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_RDTUN, &pmc_callchaindepth, 0, "depth of call chain records"); char pmc_cpuid[64]; SYSCTL_STRING(_kern_hwpmc, OID_AUTO, cpuid, CTLFLAG_RD, pmc_cpuid, 0, "cpu version string"); #ifdef HWPMC_DEBUG struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS; char pmc_debugstr[PMC_DEBUG_STRSIZE]; TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr)); SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags"); #endif /* * kern.hwpmc.hashsize -- determines the number of rows in the * hash table used to look up threads */ static int pmc_hashsize = PMC_HASH_SIZE; SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_RDTUN, &pmc_hashsize, 0, "rows in hash tables"); /* * kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU */ static int pmc_nsamples = PMC_NSAMPLES; SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_RDTUN, &pmc_nsamples, 0, "number of PC samples per CPU"); /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
*/ static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE; SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_RDTUN, &pmc_mtxpool_size, 0, "size of spin mutex pool"); /* * kern.hwpmc.threadfreelist_entries -- number of free entries */ SYSCTL_INT(_kern_hwpmc, OID_AUTO, threadfreelist_entries, CTLFLAG_RD, &pmc_threadfreelist_entries, 0, "number of avalable thread entries"); /* * kern.hwpmc.threadfreelist_max -- maximum number of free entries */ static int pmc_threadfreelist_max = PMC_THREADLIST_MAX; SYSCTL_INT(_kern_hwpmc, OID_AUTO, threadfreelist_max, CTLFLAG_RW, &pmc_threadfreelist_max, 0, "maximum number of available thread entries before freeing some"); /* * security.bsd.unprivileged_syspmcs -- allow non-root processes to * allocate system-wide PMCs. * * Allowing unprivileged processes to allocate system PMCs is convenient * if system-wide measurements need to be taken concurrently with other * per-process measurements. This feature is turned off by default. */ static int pmc_unprivileged_syspmcs = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RWTUN, &pmc_unprivileged_syspmcs, 0, "allow unprivileged process to allocate system PMCs"); /* * Hash function. Discard the lower 2 bits of the pointer since * these are always zero for our uses. The hash multiplier is * round((2^LONG_BIT) * ((sqrt(5)-1)/2)). 
*/
#if	LONG_BIT == 64
#define	_PMC_HM		11400714819323198486u
#elif	LONG_BIT == 32
#define	_PMC_HM		2654435769u
#else
#error 	Must know the size of 'long' to compile
#endif

/* Hash pointer P into a table whose index mask is M. */
#define	PMC_HASH_PTR(P,M)	((((unsigned long) (P) >> 2) * _PMC_HM) & (M))

/*
 * Syscall structures
 */

/* The `sysent' for the new syscall */
static struct sysent pmc_sysent = {
	.sy_narg =	2,
	.sy_call =	pmc_syscall_handler,
};

/* Glue handed to syscall_module_handler() via pmc_mod.priv below. */
static struct syscall_module_data pmc_syscall_mod = {
	.chainevh =	load,
	.chainarg =	NULL,
	.offset =	&pmc_syscall_num,
	.new_sysent =	&pmc_sysent,
	.old_sysent =	{ .sy_narg = 0, .sy_call = NULL },
	.flags =	SY_THR_STATIC_KLD,
};

static moduledata_t pmc_mod = {
	.name =		PMC_MODULE_NAME,
	.evhand =	syscall_module_handler,
	.priv =		&pmc_syscall_mod,
};

#ifdef EARLY_AP_STARTUP
DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SYSCALLS, SI_ORDER_ANY);
#else
DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
#endif
MODULE_VERSION(pmc, PMC_VERSION);

#ifdef	HWPMC_DEBUG
/* NOTE(review): not referenced by the parser below -- confirm it is used. */
enum pmc_dbgparse_state {
	PMCDS_WS,		/* in whitespace */
	PMCDS_MAJOR,		/* seen a major keyword */
	PMCDS_MINOR
};

/*
 * Parse a debug flag specification and, on success, install it in the
 * global 'pmc_debugflags'.
 *
 * The text in [newstr, fence) is a sequence of groups separated by
 * spaces or tabs.  Each group has the form "major=minor[,minor...]",
 * where 'major' selects one of the pdb_* words of struct pmc_debugflags
 * and each 'minor' names a bit to set in that word; a minor of "*" sets
 * every bit.  Parsing also stops at the first NUL byte.
 *
 * Flags are accumulated in a zeroed temporary and copied over
 * 'pmc_debugflags' only if the whole string parses.
 *
 * Returns 0 on success, or EINVAL for a group without '=', an unknown
 * major keyword or an unknown minor flag name; on error the global
 * flags are left untouched.
 */
static int
pmc_debugflags_parse(char *newstr, char *fence)
{
	char c, *p, *q;
	struct pmc_debugflags *tmpflags;
	int error, found, *newbits, tmp;
	size_t kwlen;

	tmpflags = malloc(sizeof(*tmpflags), M_PMC, M_WAITOK|M_ZERO);

	p = newstr;
	error = 0;

	for (; p < fence && (c = *p); p++) {

		/* skip white space */
		if (c == ' ' || c == '\t')
			continue;

		/* look for a keyword followed by "=" */
		for (q = p; p < fence && (c = *p) && c != '='; p++)
			;
		if (c != '=') {
			error = EINVAL;
			goto done;
		}

		/* [q, p) is the major keyword. */
		kwlen = p - q;
		newbits = NULL;

		/* lookup flag group name */
#define	DBG_SET_FLAG_MAJ(S,F)						\
		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
			newbits = &tmpflags->pdb_ ## F;

		DBG_SET_FLAG_MAJ("cpu",		CPU);
		DBG_SET_FLAG_MAJ("csw",		CSW);
		DBG_SET_FLAG_MAJ("logging",	LOG);
		DBG_SET_FLAG_MAJ("module",	MOD);
		DBG_SET_FLAG_MAJ("md", 		MDP);
		DBG_SET_FLAG_MAJ("owner",	OWN);
		DBG_SET_FLAG_MAJ("pmc",		PMC);
		DBG_SET_FLAG_MAJ("process",	PRC);
		DBG_SET_FLAG_MAJ("sampling", 	SAM);

		if (newbits == NULL) {
			error = EINVAL;
			goto done;
		}

		p++;		/* skip the '=' */

		/* Now parse the individual flags */
		tmp = 0;
	newflag:
		/* Scan one minor flag name into [q, p). */
		for (q = p; p < fence && (c = *p); p++)
			if (c == ' ' || c == '\t' || c == ',')
				break;

		/* p == fence or c == ws or c == "," or c == 0 */
		if ((kwlen = p - q) == 0) {
			/* Empty name: commit what was gathered for this group. */
			*newbits = tmp;
			continue;
		}

		found = 0;
#define	DBG_SET_FLAG_MIN(S,F)						\
		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
			tmp |= found = (1 << PMC_DEBUG_MIN_ ## F)

		/* a '*' denotes all possible flags in the group */
		if (kwlen == 1 && *q == '*')
			tmp = found = ~0;
		/* look for individual flag names */
		DBG_SET_FLAG_MIN("allocaterow", ALR);
		DBG_SET_FLAG_MIN("allocate",	ALL);
		DBG_SET_FLAG_MIN("attach",	ATT);
		DBG_SET_FLAG_MIN("bind",	BND);
		DBG_SET_FLAG_MIN("config",	CFG);
		DBG_SET_FLAG_MIN("exec",	EXC);
		DBG_SET_FLAG_MIN("exit",	EXT);
		DBG_SET_FLAG_MIN("find",	FND);
		DBG_SET_FLAG_MIN("flush",	FLS);
		DBG_SET_FLAG_MIN("fork",	FRK);
		DBG_SET_FLAG_MIN("getbuf",	GTB);
		DBG_SET_FLAG_MIN("hook",	PMH);
		DBG_SET_FLAG_MIN("init",	INI);
		DBG_SET_FLAG_MIN("intr",	INT);
		DBG_SET_FLAG_MIN("linktarget",	TLK);
		DBG_SET_FLAG_MIN("mayberemove", OMR);
		DBG_SET_FLAG_MIN("ops",		OPS);
		DBG_SET_FLAG_MIN("read",	REA);
		DBG_SET_FLAG_MIN("register",	REG);
		DBG_SET_FLAG_MIN("release",	REL);
		DBG_SET_FLAG_MIN("remove",	ORM);
		DBG_SET_FLAG_MIN("sample",	SAM);
		DBG_SET_FLAG_MIN("scheduleio",	SIO);
		DBG_SET_FLAG_MIN("select",	SEL);
		DBG_SET_FLAG_MIN("signal",	SIG);
		DBG_SET_FLAG_MIN("swi",		SWI);
		DBG_SET_FLAG_MIN("swo",		SWO);
		DBG_SET_FLAG_MIN("start",	STA);
		DBG_SET_FLAG_MIN("stop",	STO);
		DBG_SET_FLAG_MIN("syscall",	PMS);
		DBG_SET_FLAG_MIN("unlinktarget", TUL);
		DBG_SET_FLAG_MIN("write",	WRI);
		if (found == 0) {
			/* unrecognized flag name */
			error = EINVAL;
			goto done;
		}

		if (c == 0 || c == ' ' || c == '\t') {	/* end of flag group */
			*newbits = tmp;
			continue;
		}

		/* Step past the delimiter and parse the next minor flag. */
		p++;
		goto newflag;
	}

	/* save the new flag set */
	bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags));

 done:
	free(tmpflags, M_PMC);
	return error;
}

static int
pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS) { char *fence, *newstr; int error; unsigned int n; (void) arg1; (void) arg2; /* unused parameters */ n = sizeof(pmc_debugstr); newstr = malloc(n, M_PMC, M_WAITOK|M_ZERO); (void) strlcpy(newstr, pmc_debugstr, n); error = sysctl_handle_string(oidp, newstr, n, req); /* if there is a new string, parse and copy it */ if (error == 0 && req->newptr != NULL) { fence = newstr + (n < req->newlen ? n : req->newlen + 1); if ((error = pmc_debugflags_parse(newstr, fence)) == 0) (void) strlcpy(pmc_debugstr, newstr, sizeof(pmc_debugstr)); } free(newstr, M_PMC); return error; } #endif /* * Map a row index to a classdep structure and return the adjusted row * index for the PMC class index. */ static struct pmc_classdep * pmc_ri_to_classdep(struct pmc_mdep *md, int ri, int *adjri) { struct pmc_classdep *pcd; (void) md; KASSERT(ri >= 0 && ri < md->pmd_npmc, ("[pmc,%d] illegal row-index %d", __LINE__, ri)); pcd = pmc_rowindex_to_classdep[ri]; KASSERT(pcd != NULL, ("[pmc,%d] ri %d null pcd", __LINE__, ri)); *adjri = ri - pcd->pcd_ri; KASSERT(*adjri >= 0 && *adjri < pcd->pcd_num, ("[pmc,%d] adjusted row-index %d", __LINE__, *adjri)); return (pcd); } /* * Concurrency Control * * The driver manages the following data structures: * * - target process descriptors, one per target process * - owner process descriptors (and attached lists), one per owner process * - lookup hash tables for owner and target processes * - PMC descriptors (and attached lists) * - per-cpu hardware state * - the 'hook' variable through which the kernel calls into * this module * - the machine hardware state (managed by the MD layer) * * These data structures are accessed from: * * - thread context-switch code * - interrupt handlers (possibly on multiple cpus) * - kernel threads on multiple cpus running on behalf of user * processes doing system calls * - this driver's private kernel threads * * = Locks and Locking strategy = * * The driver uses four locking 
strategies for its operation: * * - The global SX lock "pmc_sx" is used to protect internal * data structures. * * Calls into the module by syscall() start with this lock being * held in exclusive mode. Depending on the requested operation, * the lock may be downgraded to 'shared' mode to allow more * concurrent readers into the module. Calls into the module from * other parts of the kernel acquire the lock in shared mode. * * This SX lock is held in exclusive mode for any operations that * modify the linkages between the driver's internal data structures. * * The 'pmc_hook' function pointer is also protected by this lock. * It is only examined with the sx lock held in exclusive mode. The * kernel module is allowed to be unloaded only with the sx lock held * in exclusive mode. In normal syscall handling, after acquiring the * pmc_sx lock we first check that 'pmc_hook' is non-null before * proceeding. This prevents races between the thread unloading the module * and other threads seeking to use the module. * * - Lookups of target process structures and owner process structures * cannot use the global "pmc_sx" SX lock because these lookups need * to happen during context switches and in other critical sections * where sleeping is not allowed. We protect these lookup tables * with their own private spin-mutexes, "pmc_processhash_mtx" and * "pmc_ownerhash_mtx". * * - Interrupt handlers work in a lock free manner. At interrupt * time, handlers look at the PMC pointer (phw->phw_pmc) configured * when the PMC was started. If this pointer is NULL, the interrupt * is ignored after updating driver statistics. We ensure that this * pointer is set (using an atomic operation if necessary) before the * PMC hardware is started. Conversely, this pointer is unset atomically * only after the PMC hardware is stopped. * * We ensure that everything needed for the operation of an * interrupt handler is available without it needing to acquire any * locks. 
We also ensure that a PMC's software state is destroyed only
 *   after the PMC is taken off hardware (on all CPUs).
 *
 * - Context-switch handling with process-private PMCs needs more
 *   care.
 *
 *   A given process may be the target of multiple PMCs.  For example,
 *   PMCATTACH and PMCDETACH may be requested by a process on one CPU
 *   while the target process is running on another.  A PMC could also
 *   be getting released because its owner is exiting.  We tackle
 *   these situations in the following manner:
 *
 *   - each target process structure 'pmc_process' has an array
 *     of 'struct pmc *' pointers, one for each hardware PMC.
 *
 *   - At context switch IN time, each "target" PMC in RUNNING state
 *     gets started on hardware and a pointer to each PMC is copied into
 *     the per-cpu phw array.  The 'runcount' for the PMC is
 *     incremented.
 *
 *   - At context switch OUT time, all process-virtual PMCs are stopped
 *     on hardware.  The saved value is added to the PMC's value field
 *     only if the PMC is in a non-deleted state (the PMC's state could
 *     have changed during the current time slice).
 *
 *     Note that in between a switch IN on a processor and a switch
 *     OUT, the PMC could have been released on another CPU.  Therefore
 *     context switch OUT always looks at the hardware state to turn
 *     OFF PMCs and will update a PMC's saved value only if reachable
 *     from the target process record.
 *
 *   - OP PMCRELEASE could be called on a PMC at any time (the PMC could
 *     be attached to many processes at the time of the call and could
 *     be active on multiple CPUs).
 *
 *     We prevent further scheduling of the PMC by marking it as in
 *     state 'DELETED'.  If the runcount of the PMC is non-zero then
 *     this PMC is currently running on a CPU somewhere.  The thread
 *     doing the PMCRELEASE operation waits by repeatedly doing a
 *     pause() till the runcount comes to zero.
 *
 * The contents of a PMC descriptor (struct pmc) are protected using
 * a spin-mutex.  In order to save space, we use a mutex pool.
* * In terms of lock types used by witness(4), we use: * - Type "pmc-sx", used by the global SX lock. * - Type "pmc-sleep", for sleep mutexes used by logger threads. * - Type "pmc-per-proc", for protecting PMC owner descriptors. * - Type "pmc-leaf", used for all other spin mutexes. */ /* * save the cpu binding of the current kthread */ static void pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG0(CPU,BND,2, "save-cpu"); thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; thread_unlock(curthread); PMCDBG1(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } /* * restore the cpu binding of the current thread */ static void pmc_restore_cpu_binding(struct pmc_binding *pb) { PMCDBG2(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); thread_lock(curthread); if (pb->pb_bound) sched_bind(curthread, pb->pb_cpu); else sched_unbind(curthread); thread_unlock(curthread); PMCDBG0(CPU,BND,2, "restore-cpu done"); } /* * move execution over the specified cpu and bind it there. */ static void pmc_select_cpu(int cpu) { KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] bad cpu number %d", __LINE__, cpu)); /* Never move to an inactive CPU. */ KASSERT(pmc_cpu_is_active(cpu), ("[pmc,%d] selecting inactive " "CPU %d", __LINE__, cpu)); PMCDBG1(CPU,SEL,2, "select-cpu cpu=%d", cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, cpu, curthread->td_oncpu)); PMCDBG1(CPU,SEL,2, "select-cpu cpu=%d ok", cpu); } /* * Force a context switch. * * We do this by pause'ing for 1 tick -- invoking mi_switch() is not * guaranteed to force a context switch. 
*/ static void pmc_force_context_switch(void) { pause("pmcctx", 1); } uint64_t pmc_rdtsc(void) { #if defined(__i386__) || defined(__amd64__) if (__predict_true(amd_feature & AMDID_RDTSCP)) return rdtscp(); else return rdtsc(); #else return get_cyclecount(); #endif } /* * Get the file name for an executable. This is a simple wrapper * around vn_fullpath(9). */ static void pmc_getfilename(struct vnode *v, char **fullpath, char **freepath) { *fullpath = "unknown"; *freepath = NULL; vn_fullpath(curthread, v, fullpath, freepath); } /* * remove an process owning PMCs */ void pmc_remove_owner(struct pmc_owner *po) { struct pmc *pm, *tmp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG1(OWN,ORM,1, "remove-owner po=%p", po); /* Remove descriptor from the owner hash table */ LIST_REMOVE(po, po_next); /* release all owned PMC descriptors */ LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) { PMCDBG1(OWN,ORM,2, "pmc=%p", pm); KASSERT(pm->pm_owner == po, ("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po)); pmc_release_pmc_descriptor(pm); /* will unlink from the list */ pmc_destroy_pmc_descriptor(pm); } KASSERT(po->po_sscount == 0, ("[pmc,%d] SS count not zero", __LINE__)); KASSERT(LIST_EMPTY(&po->po_pmcs), ("[pmc,%d] PMC list not empty", __LINE__)); /* de-configure the log file if present */ if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_deconfigure_log(po); } /* * remove an owner process record if all conditions are met. */ static void pmc_maybe_remove_owner(struct pmc_owner *po) { PMCDBG1(OWN,OMR,1, "maybe-remove-owner po=%p", po); /* * Remove owner record if * - this process does not own any PMCs * - this process has not allocated a system-wide sampling buffer */ if (LIST_EMPTY(&po->po_pmcs) && ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } } /* * Add an association between a target process and a PMC. 
*/ static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct pmc_target *pt; #ifdef INVARIANTS struct pmc_thread *pt_td; #endif sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d", __LINE__, pm, pp->pp_proc->p_pid)); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= ((int) md->pmd_npmc - 1), ("[pmc,%d] Illegal reference count %d for process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG3(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); #ifdef HWPMC_DEBUG LIST_FOREACH(pt, &pm->pm_targets, pt_next) if (pt->pt_process == pp) KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets", __LINE__, pp, pm)); #endif pt = malloc(sizeof(struct pmc_target), M_PMC, M_WAITOK|M_ZERO); pt->pt_process = pp; LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next); atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc, (uintptr_t)pm); if (pm->pm_owner->po_owner == pp->pp_proc) pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER; /* * Initialize the per-process values at this row index. */ pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ? pm->pm_sc.pm_reloadcount : 0; pp->pp_refcnt++; #ifdef INVARIANTS /* Confirm that the per-thread values at this row index are cleared. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt_td, &pp->pp_tds, pt_next) { KASSERT(pt_td->pt_pmcs[ri].pt_pmcval == (pmc_value_t) 0, ("[pmc,%d] pt_pmcval not cleared for pid=%d at " "ri=%d", __LINE__, pp->pp_proc->p_pid, ri)); } mtx_unlock_spin(pp->pp_tdslock); } #endif } /* * Removes the association between a target process and a PMC. 
*/ static void pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct proc *p; struct pmc_target *ptgt; struct pmc_thread *pt; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG3(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); KASSERT(pp->pp_pmcs[ri].pp_pmc == pm, ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__, ri, pm, pp->pp_pmcs[ri].pp_pmc)); pp->pp_pmcs[ri].pp_pmc = NULL; pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0; /* Clear the per-thread values at this row index. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt, &pp->pp_tds, pt_next) pt->pt_pmcs[ri].pt_pmcval = (pmc_value_t) 0; mtx_unlock_spin(pp->pp_tdslock); } /* Remove owner-specific flags */ if (pm->pm_owner->po_owner == pp->pp_proc) { pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS; pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER; } pp->pp_refcnt--; /* Remove the target process from the PMC structure */ LIST_FOREACH(ptgt, &pm->pm_targets, pt_next) if (ptgt->pt_process == pp) break; KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found " "in pmc %p", __LINE__, pp->pp_proc, pp, pm)); LIST_REMOVE(ptgt, pt_next); free(ptgt, M_PMC); /* if the PMC now lacks targets, send the owner a SIGIO */ if (LIST_EMPTY(&pm->pm_targets)) { p = pm->pm_owner->po_owner; PROC_LOCK(p); kern_psignal(p, SIGIO); PROC_UNLOCK(p); PMCDBG2(PRC,SIG,2, "signalling proc=%p signal=%d", p, SIGIO); } } /* * Check if PMC 'pm' may be attached to target process 't'. */ static int pmc_can_attach(struct pmc *pm, struct proc *t) { struct proc *o; /* pmc owner */ struct ucred *oc, *tc; /* owner, target credentials */ int decline_attach, i; /* * A PMC's owner can always attach that PMC to itself. 
*/ if ((o = pm->pm_owner->po_owner) == t) return 0; PROC_LOCK(o); oc = o->p_ucred; crhold(oc); PROC_UNLOCK(o); PROC_LOCK(t); tc = t->p_ucred; crhold(tc); PROC_UNLOCK(t); /* * The effective uid of the PMC owner should match at least one * of the {effective,real,saved} uids of the target process. */ decline_attach = oc->cr_uid != tc->cr_uid && oc->cr_uid != tc->cr_svuid && oc->cr_uid != tc->cr_ruid; /* * Every one of the target's group ids, must be in the owner's * group list. */ for (i = 0; !decline_attach && i < tc->cr_ngroups; i++) decline_attach = !groupmember(tc->cr_groups[i], oc); /* check the read and saved gids too */ if (decline_attach == 0) decline_attach = !groupmember(tc->cr_rgid, oc) || !groupmember(tc->cr_svgid, oc); crfree(tc); crfree(oc); return !decline_attach; } /* * Attach a process to a PMC. */ static int pmc_attach_one_process(struct proc *p, struct pmc *pm) { int ri, error; char *fullpath, *freepath; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * Locate the process descriptor corresponding to process 'p', * allocating space as needed. * * Verify that rowindex 'pm_rowindex' is free in the process * descriptor. * * If not, allocate space for a descriptor and link the * process descriptor and PMC. 
*/ ri = PMC_TO_ROWINDEX(pm); /* mark process as using HWPMCs */ PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL) { error = ENOMEM; goto fail; } if (pp->pp_pmcs[ri].pp_pmc == pm) {/* already present at slot [ri] */ error = EEXIST; goto fail; } if (pp->pp_pmcs[ri].pp_pmc != NULL) { error = EBUSY; goto fail; } pmc_link_target_process(pm, pp); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) && (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0) pm->pm_flags |= PMC_F_NEEDS_LOGFILE; pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */ /* issue an attach event to a configured log file */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) { if (p->p_flag & P_KPROC) { fullpath = kernelname; freepath = NULL; } else { pmc_getfilename(p->p_textvp, &fullpath, &freepath); pmclog_process_pmcattach(pm, p->p_pid, fullpath); } free(freepath, M_TEMP); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_process_mappings(pm->pm_owner, p); } return (0); fail: PROC_LOCK(p); p->p_flag &= ~P_HWPMC; PROC_UNLOCK(p); return (error); } /* * Attach a process and optionally its children */ static int pmc_attach_process(struct proc *p, struct pmc *pm) { int error; struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * If this PMC successfully allowed a GETMSR operation * in the past, disallow further ATTACHes. */ if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0) return EPERM; if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_attach_one_process(p, pm); /* * Traverse all child processes, attaching them to * this PMC. 
*/ sx_slock(&proctree_lock); top = p; for (;;) { if ((error = pmc_attach_one_process(p, pm)) != 0) break; if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } if (error) (void) pmc_detach_process(top, pm); done: sx_sunlock(&proctree_lock); return error; } /* * Detach a process from a PMC. If there are no other PMCs tracking * this process, remove the process structure from its hash table. If * 'flags' contains PMC_FLAG_REMOVE, then free the process structure. */ static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags) { int ri; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL, ("[pmc,%d] null pm pointer", __LINE__)); ri = PMC_TO_ROWINDEX(pm); PMCDBG6(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x", pm, ri, p, p->p_pid, p->p_comm, flags); if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) return ESRCH; if (pp->pp_pmcs[ri].pp_pmc != pm) return EINVAL; pmc_unlink_target_process(pm, pp); /* Issue a detach entry if a log file is configured */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcdetach(pm, p->p_pid); /* * If there are no PMCs targeting this process, we remove its * descriptor from the target hash table and unset the P_HWPMC * flag in the struct proc. */ KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal refcnt %d for process struct %p", __LINE__, pp->pp_refcnt, pp)); if (pp->pp_refcnt != 0) /* still a target of some PMC */ return 0; pmc_remove_process_descriptor(pp); if (flags & PMC_FLAG_REMOVE) pmc_destroy_process_descriptor(pp); PROC_LOCK(p); p->p_flag &= ~P_HWPMC; PROC_UNLOCK(p); return 0; } /* * Detach a process and optionally its descendants from a PMC. 
*/ static int pmc_detach_process(struct proc *p, struct pmc *pm) { struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); /* * Traverse all children, detaching them from this PMC. We * ignore errors since we could be detaching a PMC from a * partially attached proc tree. */ sx_slock(&proctree_lock); top = p; for (;;) { (void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); if (LIST_EMPTY(&pm->pm_targets)) pm->pm_flags &= ~PMC_F_ATTACH_DONE; return 0; } /* * Thread context switch IN */ static void pmc_process_csw_in(struct thread *td) { int cpu; unsigned int adjri, ri; struct pmc *pm; struct proc *p; struct pmc_cpu *pc; struct pmc_hw *phw; pmc_value_t newvalue; struct pmc_process *pp; struct pmc_thread *pt; struct pmc_classdep *pcd; p = td->td_proc; pt = NULL; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL) return; KASSERT(pp->pp_proc == td->td_proc, ("[pmc,%d] not my thread state", __LINE__)); critical_enter(); /* no preemption from this point */ cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG5(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] weird CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; for (ri = 0; ri < md->pmd_npmc; ri++) { if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL) continue; KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Target PMC in non-virtual mode (%d)", __LINE__, PMC_TO_MODE(pm))); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] Row index mismatch pmc %d != ri %d", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* * 
Only PMCs that are marked as 'RUNNING' need * be placed on hardware. */ if (pm->pm_state != PMC_STATE_RUNNING) continue; /* increment PMC runcount */ counter_u64_add(pm->pm_runcount, 1); /* configure the HWPMC we are going to use. */ pcd = pmc_ri_to_classdep(md, ri, &adjri); pcd->pcd_config_pmc(cpu, adjri, pm); phw = pc->pc_hwpmcs[ri]; KASSERT(phw != NULL, ("[pmc,%d] null hw pointer", __LINE__)); KASSERT(phw->phw_pmc == pm, ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__, phw->phw_pmc, pm)); /* * Write out saved value and start the PMC. * * Sampling PMCs use a per-thread value, while * counting mode PMCs use a per-pmc value that is * inherited across descendants. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { if (pt == NULL) pt = pmc_find_thread_descriptor(pp, td, PMC_FLAG_NONE); KASSERT(pt != NULL, ("[pmc,%d] No thread found for td=%p", __LINE__, td)); mtx_pool_lock_spin(pmc_mtxpool, pm); /* * If we have a thread descriptor, use the per-thread * counter in the descriptor. If not, we will use * a per-process counter. * * TODO: Remove the per-process "safety net" once * we have thoroughly tested that we don't hit the * above assert. */ if (pt != NULL) { if (pt->pt_pmcs[ri].pt_pmcval > 0) newvalue = pt->pt_pmcs[ri].pt_pmcval; else newvalue = pm->pm_sc.pm_reloadcount; } else { /* * Use the saved value calculated after the most * recent time a thread using the shared counter * switched out. Reset the saved count in case * another thread from this process switches in * before any threads switch out. 
*/ newvalue = pp->pp_pmcs[ri].pp_pmcval; pp->pp_pmcs[ri].pp_pmcval = pm->pm_sc.pm_reloadcount; } mtx_pool_unlock_spin(pmc_mtxpool, pm); KASSERT(newvalue > 0 && newvalue <= pm->pm_sc.pm_reloadcount, ("[pmc,%d] pmcval outside of expected range cpu=%d " "ri=%d pmcval=%jx pm_reloadcount=%jx", __LINE__, cpu, ri, newvalue, pm->pm_sc.pm_reloadcount)); } else { KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC, ("[pmc,%d] illegal mode=%d", __LINE__, PMC_TO_MODE(pm))); mtx_pool_lock_spin(pmc_mtxpool, pm); newvalue = PMC_PCPU_SAVED(cpu, ri) = pm->pm_gv.pm_savedvalue; mtx_pool_unlock_spin(pmc_mtxpool, pm); } PMCDBG3(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue); pcd->pcd_write_pmc(cpu, adjri, newvalue); /* If a sampling mode PMC, reset stalled state. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) pm->pm_pcpu_state[cpu].pps_stalled = 0; /* Indicate that we desire this to run. */ pm->pm_pcpu_state[cpu].pps_cpustate = 1; /* Start the PMC. */ pcd->pcd_start_pmc(cpu, adjri); } /* * perform any other architecture/cpu dependent thread * switch-in actions. */ (void) (*md->pmd_switch_in)(pc, pp); critical_exit(); } /* * Thread context switch OUT. */ static void pmc_process_csw_out(struct thread *td) { int cpu; int64_t tmp; struct pmc *pm; struct proc *p; enum pmc_mode mode; struct pmc_cpu *pc; pmc_value_t newvalue; unsigned int adjri, ri; struct pmc_process *pp; struct pmc_thread *pt = NULL; struct pmc_classdep *pcd; /* * Locate our process descriptor; this may be NULL if * this process is exiting and we have already removed * the process from the target process table. * * Note that due to kernel preemption, multiple * context switches may happen while the process is * exiting. * * Note also that if the target process cannot be * found we still need to deconfigure any PMCs that * are currently running on hardware. 
*/ p = td->td_proc; pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE); /* * save PMCs */ critical_enter(); cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG5(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d weird CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; /* * When a PMC gets unlinked from a target PMC, it will * be removed from the target's pp_pmc[] array. * * However, on a MP system, the target could have been * executing on another CPU at the time of the unlink. * So, at context switch OUT time, we need to look at * the hardware to determine if a PMC is scheduled on * it. */ for (ri = 0; ri < md->pmd_npmc; ri++) { pcd = pmc_ri_to_classdep(md, ri, &adjri); pm = NULL; (void) (*pcd->pcd_get_config)(cpu, adjri, &pm); if (pm == NULL) /* nothing at this row index */ continue; mode = PMC_TO_MODE(pm); if (!PMC_IS_VIRTUAL_MODE(mode)) continue; /* not a process virtual PMC */ KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* * Change desired state, and then stop if not stalled. * This two-step dance should avoid race conditions where * an interrupt re-enables the PMC after this code has * already checked the pm_stalled flag. */ pm->pm_pcpu_state[cpu].pps_cpustate = 0; if (pm->pm_pcpu_state[cpu].pps_stalled == 0) pcd->pcd_stop_pmc(cpu, adjri); /* reduce this PMC's runcount */ counter_u64_add(pm->pm_runcount, -1); /* * If this PMC is associated with this process, * save the reading. 
*/ if (pm->pm_state != PMC_STATE_DELETED && pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) { KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); KASSERT(pp->pp_refcnt > 0, ("[pmc,%d] pp refcnt = %d", __LINE__, pp->pp_refcnt)); pcd->pcd_read_pmc(cpu, adjri, &newvalue); if (mode == PMC_MODE_TS) { PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d val=%jd (samp)", cpu, ri, newvalue); if (pt == NULL) pt = pmc_find_thread_descriptor(pp, td, PMC_FLAG_NONE); KASSERT(pt != NULL, ("[pmc,%d] No thread found for td=%p", __LINE__, td)); mtx_pool_lock_spin(pmc_mtxpool, pm); /* * If we have a thread descriptor, save the * per-thread counter in the descriptor. If not, * we will update the per-process counter. * * TODO: Remove the per-process "safety net" * once we have thoroughly tested that we * don't hit the above assert. */ if (pt != NULL) pt->pt_pmcs[ri].pt_pmcval = newvalue; else { /* * For sampling process-virtual PMCs, * newvalue is the number of events to * be seen until the next sampling * interrupt. We can just add the events * left from this invocation to the * counter, then adjust in case we * overflow our range. * * (Recall that we reload the counter * every time we use it.) */ pp->pp_pmcs[ri].pp_pmcval += newvalue; if (pp->pp_pmcs[ri].pp_pmcval > pm->pm_sc.pm_reloadcount) pp->pp_pmcs[ri].pp_pmcval -= pm->pm_sc.pm_reloadcount; } mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd (count)", cpu, ri, tmp); /* * For counting process-virtual PMCs, * we expect the count to be * increasing monotonically, modulo a 64 * bit wraparound. 
*/ KASSERT(tmp >= 0, ("[pmc,%d] negative increment cpu=%d " "ri=%d newvalue=%jx saved=%jx " "incr=%jx", __LINE__, cpu, ri, newvalue, PMC_PCPU_SAVED(cpu,ri), tmp)); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin(pmc_mtxpool, pm); if (pm->pm_flags & PMC_F_LOG_PROCCSW) pmclog_process_proccsw(pm, pp, tmp, td); } } /* mark hardware as free */ pcd->pcd_config_pmc(cpu, adjri, NULL); } /* * perform any other architecture/cpu dependent thread * switch out functions. */ (void) (*md->pmd_switch_out)(pc, pp); critical_exit(); } /* * A new thread for a process. */ static void pmc_process_thread_add(struct thread *td) { struct pmc_process *pmc; pmc = pmc_find_process_descriptor(td->td_proc, PMC_FLAG_NONE); if (pmc != NULL) pmc_find_thread_descriptor(pmc, td, PMC_FLAG_ALLOCATE); } /* * A thread delete for a process. */ static void pmc_process_thread_delete(struct thread *td) { struct pmc_process *pmc; pmc = pmc_find_process_descriptor(td->td_proc, PMC_FLAG_NONE); if (pmc != NULL) pmc_thread_descriptor_pool_free(pmc_find_thread_descriptor(pmc, td, PMC_FLAG_REMOVE)); } /* * A userret() call for a thread. */ static void pmc_process_thread_userret(struct thread *td) { sched_pin(); pmc_capture_user_callchain(curcpu, PMC_UR, td->td_frame); sched_unpin(); } /* * A mapping change for a process. */ static void pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm) { int ri; pid_t pid; char *fullpath, *freepath; const struct pmc *pm; struct pmc_owner *po; const struct pmc_process *pp; freepath = fullpath = NULL; - MPASS(!in_epoch()); + MPASS(!in_epoch(global_epoch_preempt)); pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath); pid = td->td_proc->p_pid; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); /* Inform owners of all system-wide sampling PMCs. 
*/ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, pid, pkm->pm_address, fullpath); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) goto done; /* * Inform sampling PMC owners tracking this process. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_in(pm->pm_owner, pid, pkm->pm_address, fullpath); done: if (freepath) free(freepath, M_TEMP); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } /* * Log an munmap request. */ static void pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm) { int ri; pid_t pid; struct pmc_owner *po; const struct pmc *pm; const struct pmc_process *pp; pid = td->td_proc->p_pid; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) return; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_out(pm->pm_owner, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); } /* * Log mapping information about the kernel. */ static void pmc_log_kernel_mappings(struct pmc *pm) { struct pmc_owner *po; struct pmckern_map_in *km, *kmbase; - MPASS(in_epoch() || sx_xlocked(&pmc_sx)); + MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] non-sampling PMC (%p) desires mapping information", __LINE__, (void *) pm)); po = pm->pm_owner; if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE) return; if (PMC_TO_MODE(pm) == PMC_MODE_SS) pmc_process_allproc(pm); /* * Log the current set of kernel modules. 
*/ kmbase = linker_hwpmc_list_objects(); for (km = kmbase; km->pm_file != NULL; km++) { PMCDBG2(LOG,REG,1,"%s %p", (char *) km->pm_file, (void *) km->pm_address); pmclog_process_map_in(po, (pid_t) -1, km->pm_address, km->pm_file); } free(kmbase, M_LINKER); po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE; } /* * Log the mappings for a single process. */ static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p) { vm_map_t map; struct vnode *vp; struct vmspace *vm; vm_map_entry_t entry; vm_offset_t last_end; u_int last_timestamp; struct vnode *last_vp; vm_offset_t start_addr; vm_object_t obj, lobj, tobj; char *fullpath, *freepath; last_vp = NULL; last_end = (vm_offset_t) 0; fullpath = freepath = NULL; if ((vm = vmspace_acquire_ref(p)) == NULL) return; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry == NULL) { PMCDBG2(LOG,OPS,2, "hwpmc: vm_map entry unexpectedly " "NULL! pid=%d vm_map=%p\n", p->p_pid, map); break; } /* * We only care about executable map entries. */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || !(entry->protection & VM_PROT_EXECUTE) || (entry->object.vm_object == NULL)) { continue; } obj = entry->object.vm_object; VM_OBJECT_RLOCK(obj); /* * Walk the backing_object list to find the base * (non-shadowed) vm_object. */ for (lobj = tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } /* * At this point lobj is the base vm_object and it is locked. */ if (lobj == NULL) { PMCDBG3(LOG,OPS,2, "hwpmc: lobj unexpectedly NULL! pid=%d " "vm_map=%p vm_obj=%p\n", p->p_pid, map, obj); VM_OBJECT_RUNLOCK(obj); continue; } vp = vm_object_vnode(lobj); if (vp == NULL) { if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); continue; } /* * Skip contiguous regions that point to the same * vnode, so we don't emit redundant MAP-IN * directives. 
*/ if (entry->start == last_end && vp == last_vp) { last_end = entry->end; if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); continue; } /* * We don't want to keep the proc's vm_map or this * vm_object locked while we walk the pathname, since * vn_fullpath() can sleep. However, if we drop the * lock, it's possible for concurrent activity to * modify the vm_map list. To protect against this, * we save the vm_map timestamp before we release the * lock, and check it after we reacquire the lock * below. */ start_addr = entry->start; last_end = entry->end; last_timestamp = map->timestamp; vm_map_unlock_read(map); vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); freepath = NULL; pmc_getfilename(vp, &fullpath, &freepath); last_vp = vp; vrele(vp); vp = NULL; pmclog_process_map_in(po, p->p_pid, start_addr, fullpath); if (freepath) free(freepath, M_TEMP); vm_map_lock_read(map); /* * If our saved timestamp doesn't match, this means * that the vm_map was modified out from under us and * we can't trust our current "entry" pointer. Do a * new lookup for this entry. If there is no entry * for this address range, vm_map_lookup_entry() will * return the previous one, so we always want to go to * entry->next on the next loop iteration. * * There is an edge condition here that can occur if * there is no entry at or before this address. In * this situation, vm_map_lookup_entry returns * &map->header, which would cause our loop to abort * without processing the rest of the map. However, * in practice this will never happen for process * vm_map. This is because the executable's text * segment is the first mapping in the proc's address * space, and this mapping is never removed until the * process exits, so there will always be a non-header * entry at or before the requested address for * vm_map_lookup_entry to return. 
*/ if (map->timestamp != last_timestamp) vm_map_lookup_entry(map, last_end - 1, &entry); } vm_map_unlock_read(map); vmspace_free(vm); return; } /* * Log mappings for all processes in the system. */ static void pmc_log_all_process_mappings(struct pmc_owner *po) { struct proc *p, *top; sx_assert(&pmc_sx, SX_XLOCKED); if ((p = pfind(1)) == NULL) panic("[pmc,%d] Cannot find init", __LINE__); PROC_UNLOCK(p); sx_slock(&proctree_lock); top = p; for (;;) { pmc_log_process_mappings(po, p); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); } /* * The 'hook' invoked from the kernel proper */ #ifdef HWPMC_DEBUG const char *pmc_hooknames[] = { /* these strings correspond to PMC_FN_* in */ "", "EXEC", "CSW-IN", "CSW-OUT", "SAMPLE", "UNUSED1", "UNUSED2", "MMAP", "MUNMAP", "CALLCHAIN-NMI", "CALLCHAIN-SOFT", "SOFTSAMPLING", "THR-CREATE", "THR-EXIT", "THR-USERRET", "THR-CREATE-LOG", "THR-EXIT-LOG", "PROC-CREATE-LOG" }; #endif static int pmc_hook_handler(struct thread *td, int function, void *arg) { int cpu; PMCDBG4(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function, pmc_hooknames[function], arg); switch (function) { /* * Process exec() */ case PMC_FN_PROCESS_EXEC: { char *fullpath, *freepath; unsigned int ri; int is_using_hwpmcs; struct pmc *pm; struct proc *p; struct pmc_owner *po; struct pmc_process *pp; struct pmckern_procexec *pk; sx_assert(&pmc_sx, SX_XLOCKED); p = td->td_proc; pmc_getfilename(p->p_textvp, &fullpath, &freepath); pk = (struct pmckern_procexec *) arg; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); /* Inform owners of SS mode PMCs of the exec event. 
*/ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, PMC_ID_INVALID, p->p_pid, pk->pm_entryaddr, fullpath); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); if (!is_using_hwpmcs) { if (freepath) free(freepath, M_TEMP); break; } /* * PMCs are not inherited across an exec(): remove any * PMCs that this process is the owner of. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } /* * If the process being exec'ed is not the target of any * PMC, we are done. */ if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) { if (freepath) free(freepath, M_TEMP); break; } /* * Log the exec event to all monitoring owners. Skip * owners who have already received the event because * they had system sampling PMCs active. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, pm->pm_id, p->p_pid, pk->pm_entryaddr, fullpath); } if (freepath) free(freepath, M_TEMP); PMCDBG4(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d", p, p->p_pid, p->p_comm, pk->pm_credentialschanged); if (pk->pm_credentialschanged == 0) /* no change */ break; /* * If the newly exec()'ed process has a different credential * than before, allow it to be the target of a PMC only if * the PMC's owner has sufficient privilege. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) if (pmc_can_attach(pm, td->td_proc) != 0) pmc_detach_one_process(td->td_proc, pm, PMC_FLAG_NONE); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__, pp->pp_refcnt, pp)); /* * If this process is no longer the target of any * PMCs, we can remove the process entry and free * up space. 
*/ if (pp->pp_refcnt == 0) { pmc_remove_process_descriptor(pp); pmc_destroy_process_descriptor(pp); break; } } break; case PMC_FN_CSW_IN: pmc_process_csw_in(td); break; case PMC_FN_CSW_OUT: pmc_process_csw_out(td); break; /* * Process accumulated PC samples. * * This function is expected to be called by hardclock() for * each CPU that has accumulated PC samples. * * This function is to be executed on the CPU whose samples * are being processed. */ case PMC_FN_DO_SAMPLES: /* * Clear the cpu specific bit in the CPU mask before * do the rest of the processing. If the NMI handler * gets invoked after the "atomic_clear_int()" call * below but before "pmc_process_samples()" gets * around to processing the interrupt, then we will * come back here at the next hardclock() tick (and * may find nothing to do if "pmc_process_samples()" * had already processed the interrupt). We don't * lose the interrupt sample. */ DPCPU_SET(pmc_sampled, 0); cpu = PCPU_GET(cpuid); pmc_process_samples(cpu, PMC_HR); pmc_process_samples(cpu, PMC_SR); pmc_process_samples(cpu, PMC_UR); break; case PMC_FN_MMAP: pmc_process_mmap(td, (struct pmckern_map_in *) arg); break; case PMC_FN_MUNMAP: - MPASS(in_epoch() || sx_xlocked(&pmc_sx)); + MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); pmc_process_munmap(td, (struct pmckern_map_out *) arg); break; case PMC_FN_PROC_CREATE_LOG: pmc_process_proccreate((struct proc *)arg); break; case PMC_FN_USER_CALLCHAIN: /* * Record a call chain. */ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_HR, (struct trapframe *) arg); KASSERT(td->td_pinned == 1, ("[pmc,%d] invalid td_pinned value", __LINE__)); sched_unpin(); /* Can migrate safely now. */ td->td_pflags &= ~TDP_CALLCHAIN; break; case PMC_FN_USER_CALLCHAIN_SOFT: /* * Record a call chain. 
*/ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); cpu = PCPU_GET(cpuid); pmc_capture_user_callchain(cpu, PMC_SR, (struct trapframe *) arg); KASSERT(td->td_pinned == 1, ("[pmc,%d] invalid td_pinned value", __LINE__)); sched_unpin(); /* Can migrate safely now. */ td->td_pflags &= ~TDP_CALLCHAIN; break; case PMC_FN_SOFT_SAMPLING: /* * Call soft PMC sampling intr. */ pmc_soft_intr((struct pmckern_soft *) arg); break; case PMC_FN_THR_CREATE: pmc_process_thread_add(td); pmc_process_threadcreate(td); break; case PMC_FN_THR_CREATE_LOG: pmc_process_threadcreate(td); break; case PMC_FN_THR_EXIT: KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_process_thread_delete(td); pmc_process_threadexit(td); break; case PMC_FN_THR_EXIT_LOG: pmc_process_threadexit(td); break; case PMC_FN_THR_USERRET: KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_process_thread_userret(td); break; default: #ifdef HWPMC_DEBUG KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function)); #endif break; } return 0; } /* * allocate a 'struct pmc_owner' descriptor in the owner hash table. 
*/ static struct pmc_owner * pmc_allocate_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; /* allocate space for N pointers and one descriptor struct */ po = malloc(sizeof(struct pmc_owner), M_PMC, M_WAITOK|M_ZERO); po->po_owner = p; LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */ TAILQ_INIT(&po->po_logbuffers); mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN); PMCDBG4(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p", p, p->p_pid, p->p_comm, po); return po; } static void pmc_destroy_owner_descriptor(struct pmc_owner *po) { PMCDBG4(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)", po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); mtx_destroy(&po->po_mtx); free(po, M_PMC); } /* * Allocate a thread descriptor from the free pool. * * NOTE: This *can* return NULL. */ static struct pmc_thread * pmc_thread_descriptor_pool_alloc(void) { struct pmc_thread *pt; mtx_lock_spin(&pmc_threadfreelist_mtx); if ((pt = LIST_FIRST(&pmc_threadfreelist)) != NULL) { LIST_REMOVE(pt, pt_next); pmc_threadfreelist_entries--; } mtx_unlock_spin(&pmc_threadfreelist_mtx); return (pt); } /* * Add a thread descriptor to the free pool. We use this instead of free() * to maintain a cache of free entries. Additionally, we can safely call * this function when we cannot call free(), such as in a critical section. * */ static void pmc_thread_descriptor_pool_free(struct pmc_thread *pt) { if (pt == NULL) return; memset(pt, 0, THREADENTRY_SIZE); mtx_lock_spin(&pmc_threadfreelist_mtx); LIST_INSERT_HEAD(&pmc_threadfreelist, pt, pt_next); pmc_threadfreelist_entries++; if (pmc_threadfreelist_entries > pmc_threadfreelist_max) GROUPTASK_ENQUEUE(&free_gtask); mtx_unlock_spin(&pmc_threadfreelist_mtx); } /* * A callout to manage the free list. 
*/ static void pmc_thread_descriptor_pool_free_task(void *arg __unused) { struct pmc_thread *pt; LIST_HEAD(, pmc_thread) tmplist; int delta; LIST_INIT(&tmplist); /* Determine what changes, if any, we need to make. */ mtx_lock_spin(&pmc_threadfreelist_mtx); delta = pmc_threadfreelist_entries - pmc_threadfreelist_max; while (delta > 0 && (pt = LIST_FIRST(&pmc_threadfreelist)) != NULL) { delta--; LIST_REMOVE(pt, pt_next); LIST_INSERT_HEAD(&tmplist, pt, pt_next); } mtx_unlock_spin(&pmc_threadfreelist_mtx); /* If there are entries to free, free them. */ while (!LIST_EMPTY(&tmplist)) { pt = LIST_FIRST(&tmplist); LIST_REMOVE(pt, pt_next); free(pt, M_PMC); } } /* * Drain the thread free pool, freeing all allocations. */ static void pmc_thread_descriptor_pool_drain() { struct pmc_thread *pt, *next; LIST_FOREACH_SAFE(pt, &pmc_threadfreelist, pt_next, next) { LIST_REMOVE(pt, pt_next); free(pt, M_PMC); } } /* * find the descriptor corresponding to thread 'td', adding or removing it * as specified by 'mode'. * * Note that this supports additional mode flags in addition to those * supported by pmc_find_process_descriptor(): * PMC_FLAG_NOWAIT: Causes the function to not wait for mallocs. * This makes it safe to call while holding certain other locks. */ static struct pmc_thread * pmc_find_thread_descriptor(struct pmc_process *pp, struct thread *td, uint32_t mode) { struct pmc_thread *pt = NULL, *ptnew = NULL; int wait_flag; KASSERT(td != NULL, ("[pmc,%d] called to add NULL td", __LINE__)); /* * Pre-allocate memory in the PMC_FLAG_ALLOCATE case prior to * acquiring the lock. 
*/ if (mode & PMC_FLAG_ALLOCATE) { if ((ptnew = pmc_thread_descriptor_pool_alloc()) == NULL) { wait_flag = M_WAITOK; - if ((mode & PMC_FLAG_NOWAIT) || in_epoch()) + if ((mode & PMC_FLAG_NOWAIT) || in_epoch(global_epoch_preempt)) wait_flag = M_NOWAIT; ptnew = malloc(THREADENTRY_SIZE, M_PMC, wait_flag|M_ZERO); } } mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt, &pp->pp_tds, pt_next) if (pt->pt_td == td) break; if ((mode & PMC_FLAG_REMOVE) && pt != NULL) LIST_REMOVE(pt, pt_next); if ((mode & PMC_FLAG_ALLOCATE) && pt == NULL && ptnew != NULL) { pt = ptnew; ptnew = NULL; pt->pt_td = td; LIST_INSERT_HEAD(&pp->pp_tds, pt, pt_next); } mtx_unlock_spin(pp->pp_tdslock); if (ptnew != NULL) { free(ptnew, M_PMC); } return pt; } /* * Try to add thread descriptors for each thread in a process. */ static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp) { struct thread *curtd; struct pmc_thread **tdlist; int i, tdcnt, tdlistsz; KASSERT(!PROC_LOCKED(p), ("[pmc,%d] proc unexpectedly locked", __LINE__)); tdcnt = 32; restart: tdlistsz = roundup2(tdcnt, 32); tdcnt = 0; tdlist = malloc(sizeof(struct pmc_thread*) * tdlistsz, M_TEMP, M_WAITOK); PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, curtd) tdcnt++; if (tdcnt >= tdlistsz) { PROC_UNLOCK(p); free(tdlist, M_TEMP); goto restart; } /* * Try to add each thread to the list without sleeping. If unable, * add to a queue to retry after dropping the process lock. */ tdcnt = 0; FOREACH_THREAD_IN_PROC(p, curtd) { tdlist[tdcnt] = pmc_find_thread_descriptor(pp, curtd, PMC_FLAG_ALLOCATE|PMC_FLAG_NOWAIT); if (tdlist[tdcnt] == NULL) { PROC_UNLOCK(p); for (i = 0; i <= tdcnt; i++) pmc_thread_descriptor_pool_free(tdlist[i]); free(tdlist, M_TEMP); goto restart; } tdcnt++; } PROC_UNLOCK(p); free(tdlist, M_TEMP); } /* * find the descriptor corresponding to process 'p', adding or removing it * as specified by 'mode'. 
*/ static struct pmc_process * pmc_find_process_descriptor(struct proc *p, uint32_t mode) { uint32_t hindex; struct pmc_process *pp, *ppnew; struct pmc_processhash *pph; hindex = PMC_HASH_PTR(p, pmc_processhashmask); pph = &pmc_processhash[hindex]; ppnew = NULL; /* * Pre-allocate memory in the PMC_FLAG_ALLOCATE case since we * cannot call malloc(9) once we hold a spin lock. */ if (mode & PMC_FLAG_ALLOCATE) ppnew = malloc(sizeof(struct pmc_process) + md->pmd_npmc * sizeof(struct pmc_targetstate), M_PMC, M_WAITOK|M_ZERO); mtx_lock_spin(&pmc_processhash_mtx); LIST_FOREACH(pp, pph, pp_next) if (pp->pp_proc == p) break; if ((mode & PMC_FLAG_REMOVE) && pp != NULL) LIST_REMOVE(pp, pp_next); if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL && ppnew != NULL) { ppnew->pp_proc = p; LIST_INIT(&ppnew->pp_tds); ppnew->pp_tdslock = mtx_pool_find(pmc_mtxpool, ppnew); LIST_INSERT_HEAD(pph, ppnew, pp_next); mtx_unlock_spin(&pmc_processhash_mtx); pp = ppnew; ppnew = NULL; /* Add thread descriptors for this process' current threads. */ pmc_add_thread_descriptors_from_proc(p, pp); } else mtx_unlock_spin(&pmc_processhash_mtx); if (ppnew != NULL) free(ppnew, M_PMC); return pp; } /* * remove a process descriptor from the process hash table. */ static void pmc_remove_process_descriptor(struct pmc_process *pp) { KASSERT(pp->pp_refcnt == 0, ("[pmc,%d] Removing process descriptor %p with count %d", __LINE__, pp, pp->pp_refcnt)); mtx_lock_spin(&pmc_processhash_mtx); LIST_REMOVE(pp, pp_next); mtx_unlock_spin(&pmc_processhash_mtx); } /* * destroy a process descriptor. 
*/ static void pmc_destroy_process_descriptor(struct pmc_process *pp) { struct pmc_thread *pmc_td; while ((pmc_td = LIST_FIRST(&pp->pp_tds)) != NULL) { LIST_REMOVE(pmc_td, pt_next); pmc_thread_descriptor_pool_free(pmc_td); } free(pp, M_PMC); } /* * find an owner descriptor corresponding to proc 'p' */ static struct pmc_owner * pmc_find_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; po = NULL; LIST_FOREACH(po, poh, po_next) if (po->po_owner == p) break; PMCDBG5(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> " "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po); return po; } /* * pmc_allocate_pmc_descriptor * * Allocate a pmc descriptor and initialize its * fields. */ static struct pmc * pmc_allocate_pmc_descriptor(void) { struct pmc *pmc; pmc = malloc(sizeof(struct pmc), M_PMC, M_WAITOK|M_ZERO); pmc->pm_runcount = counter_u64_alloc(M_WAITOK); pmc->pm_pcpu_state = malloc(sizeof(struct pmc_pcpu_state)*mp_ncpus, M_PMC, M_WAITOK|M_ZERO); PMCDBG1(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc); return pmc; } /* * Destroy a pmc descriptor. 
*/ static void pmc_destroy_pmc_descriptor(struct pmc *pm) { KASSERT(pm->pm_state == PMC_STATE_DELETED || pm->pm_state == PMC_STATE_FREE, ("[pmc,%d] destroying non-deleted PMC", __LINE__)); KASSERT(LIST_EMPTY(&pm->pm_targets), ("[pmc,%d] destroying pmc with targets", __LINE__)); KASSERT(pm->pm_owner == NULL, ("[pmc,%d] destroying pmc attached to an owner", __LINE__)); KASSERT(counter_u64_fetch(pm->pm_runcount) == 0, ("[pmc,%d] pmc has non-zero run count %ld", __LINE__, (unsigned long)counter_u64_fetch(pm->pm_runcount))); counter_u64_free(pm->pm_runcount); free(pm->pm_pcpu_state, M_PMC); free(pm, M_PMC); } static void pmc_wait_for_pmc_idle(struct pmc *pm) { #ifdef HWPMC_DEBUG volatile int maxloop; maxloop = 100 * pmc_cpu_max(); #endif /* * Loop (with a forced context switch) till the PMC's runcount * comes down to zero. */ pmclog_flush(pm->pm_owner, 1); while (counter_u64_fetch(pm->pm_runcount) > 0) { pmclog_flush(pm->pm_owner, 1); #ifdef HWPMC_DEBUG maxloop--; KASSERT(maxloop > 0, ("[pmc,%d] (ri%d, rc%ld) waiting too long for " "pmc to be free", __LINE__, PMC_TO_ROWINDEX(pm), (unsigned long)counter_u64_fetch(pm->pm_runcount))); #endif pmc_force_context_switch(); } } /* * This function does the following things: * * - detaches the PMC from hardware * - unlinks all target threads that were attached to it * - removes the PMC from its owner's list * - destroys the PMC private mutex * * Once this function completes, the given pmc pointer can be freed by * calling pmc_destroy_pmc_descriptor(). 
*/
/*
 * Caller must hold pmc_sx exclusively (asserted below).  'pm' must be
 * non-NULL.  The descriptor memory itself is not freed here.
 */
static void
pmc_release_pmc_descriptor(struct pmc *pm)
{
	enum pmc_mode mode;
	struct pmc_hw *phw;
	u_int adjri, ri, cpu;
	struct pmc_owner *po;
	struct pmc_binding pb;
	struct pmc_process *pp;
	struct pmc_classdep *pcd;
	struct pmc_target *ptgt, *tmp;

	sx_assert(&pmc_sx, SX_XLOCKED);

	KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));

	/* Row index, the class handling that row, and the class-local index. */
	ri   = PMC_TO_ROWINDEX(pm);
	pcd  = pmc_ri_to_classdep(md, ri, &adjri);
	mode = PMC_TO_MODE(pm);

	PMCDBG3(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
	    mode);

	/*
	 * First, we take the PMC off hardware.
	 */
	cpu = 0;
	if (PMC_IS_SYSTEM_MODE(mode)) {

		/*
		 * A system mode PMC runs on a specific CPU.  Switch
		 * to this CPU and turn hardware off.
		 */
		pmc_save_cpu_binding(&pb);

		cpu = PMC_TO_CPU(pm);

		pmc_select_cpu(cpu);

		/* switch off non-stalled CPUs */
		pm->pm_pcpu_state[cpu].pps_cpustate = 0;
		if (pm->pm_state == PMC_STATE_RUNNING &&
			pm->pm_pcpu_state[cpu].pps_stalled == 0) {

			phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];

			KASSERT(phw->phw_pmc == pm,
			    ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
				__LINE__, ri, phw->phw_pmc, pm));
			PMCDBG2(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);

			/* The class stop hook runs inside a critical section. */
			critical_enter();
			pcd->pcd_stop_pmc(cpu, adjri);
			critical_exit();
		}

		PMCDBG2(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);

		/* Deconfigure the hardware slot (NULL pmc), also in a critical section. */
		critical_enter();
		pcd->pcd_config_pmc(cpu, adjri, NULL);
		critical_exit();

		/* adjust the global and process count of SS mode PMCs */
		if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) {
			po = pm->pm_owner;
			po->po_sscount--;
			if (po->po_sscount == 0) {
				/*
				 * Last running SS PMC of this owner: drop the
				 * global count and unlink the owner via its
				 * po_ssnext linkage.
				 * NOTE(review): the epoch wait presumably lets
				 * readers still walking that list finish --
				 * confirm.
				 */
				atomic_subtract_rel_int(&pmc_ss_count, 1);
				CK_LIST_REMOVE(po, po_ssnext);
				epoch_wait_preempt(global_epoch_preempt);
			}
		}

		pm->pm_state = PMC_STATE_DELETED;

		pmc_restore_cpu_binding(&pb);

		/*
		 * We could have references to this PMC structure in
		 * the per-cpu sample queues.  Wait for the queue to
		 * drain.
		 */
		pmc_wait_for_pmc_idle(pm);

	} else if (PMC_IS_VIRTUAL_MODE(mode)) {

		/*
		 * A virtual PMC could be running on multiple CPUs at
		 * a given instant.
		 *
		 * By marking its state as DELETED, we ensure that
		 * this PMC is never further scheduled on hardware.
		 *
		 * Then we wait till all CPUs are done with this PMC.
		 */
		pm->pm_state = PMC_STATE_DELETED;


		/* Wait for the PMCs runcount to come to zero. */
		pmc_wait_for_pmc_idle(pm);

		/*
		 * At this point the PMC is off all CPUs and cannot be
		 * freshly scheduled onto a CPU.  It is now safe to
		 * unlink all targets from this PMC.  If a
		 * process-record's refcount falls to zero, we remove
		 * it from the hash table.  The module-wide SX lock
		 * protects us from races.
		 */
		LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
			pp = ptgt->pt_process;
			pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */

			PMCDBG1(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);

			/*
			 * If the target process record shows that no
			 * PMCs are attached to it, reclaim its space.
			 */

			if (pp->pp_refcnt == 0) {
				pmc_remove_process_descriptor(pp);
				pmc_destroy_process_descriptor(pp);
			}
		}

		cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */

	}

	/*
	 * Release any MD resources
	 *
	 * 'cpu' is the PMC's CPU for system mode, the current thread's CPU
	 * for virtual mode, and 0 otherwise.  The hook's return value is
	 * deliberately ignored.
	 */
	(void) pcd->pcd_release_pmc(cpu, adjri, pm);

	/*
	 * Update row disposition
	 */

	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
		PMC_UNMARK_ROW_STANDALONE(ri);
	else
		PMC_UNMARK_ROW_THREAD(ri);

	/* unlink from the owner's list */
	if (pm->pm_owner) {
		LIST_REMOVE(pm, pm_next);
		pm->pm_owner = NULL;
	}
}

/*
 * Register an owner and a pmc.
*/ static int pmc_register_owner(struct proc *p, struct pmc *pmc) { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) return ENOMEM; KASSERT(pmc->pm_owner == NULL, ("[pmc,%d] attempting to own an initialized PMC", __LINE__)); pmc->pm_owner = po; LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next); PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcallocate(pmc); PMCDBG2(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p", po, pmc); return 0; } /* * Return the current row disposition: * == 0 => FREE * > 0 => PROCESS MODE * < 0 => SYSTEM MODE */ int pmc_getrowdisp(int ri) { return pmc_pmcdisp[ri]; } /* * Check if a PMC at row index 'ri' can be allocated to the current * process. * * Allocation can fail if: * - the current process is already being profiled by a PMC at index 'ri', * attached to it via OP_PMCATTACH. * - the current process has already allocated a PMC at index 'ri' * via OP_ALLOCATE. */ static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu) { enum pmc_mode mode; struct pmc *pm; struct pmc_owner *po; struct pmc_process *pp; PMCDBG5(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d " "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu); /* * We shouldn't have already allocated a process-mode PMC at * row index 'ri'. * * We shouldn't have allocated a system-wide PMC on the same * CPU and same RI. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) LIST_FOREACH(pm, &po->po_pmcs, pm_next) { if (PMC_TO_ROWINDEX(pm) == ri) { mode = PMC_TO_MODE(pm); if (PMC_IS_VIRTUAL_MODE(mode)) return EEXIST; if (PMC_IS_SYSTEM_MODE(mode) && (int) PMC_TO_CPU(pm) == cpu) return EEXIST; } } /* * We also shouldn't be the target of any PMC at this index * since otherwise a PMC_ATTACH to ourselves will fail. 
*/ if ((pp = pmc_find_process_descriptor(p, 0)) != NULL) if (pp->pp_pmcs[ri].pp_pmc) return EEXIST; PMCDBG4(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok", p, p->p_pid, p->p_comm, ri); return 0; } /* * Check if a given PMC at row index 'ri' can be currently used in * mode 'mode'. */ static int pmc_can_allocate_row(int ri, enum pmc_mode mode) { enum pmc_disp disp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG2(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode); if (PMC_IS_SYSTEM_MODE(mode)) disp = PMC_DISP_STANDALONE; else disp = PMC_DISP_THREAD; /* * check disposition for PMC row 'ri': * * Expected disposition Row-disposition Result * * STANDALONE STANDALONE or FREE proceed * STANDALONE THREAD fail * THREAD THREAD or FREE proceed * THREAD STANDALONE fail */ if (!PMC_ROW_DISP_IS_FREE(ri) && !(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) && !(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri))) return EBUSY; /* * All OK */ PMCDBG2(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode); return 0; } /* * Find a PMC descriptor with user handle 'pmcid' for thread 'td'. */ static struct pmc * pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid) { struct pmc *pm; KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc, ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__, PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc)); LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_id == pmcid) return pm; return NULL; } static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc) { struct pmc *pm, *opm; struct pmc_owner *po; struct pmc_process *pp; PMCDBG1(PMC,FND,1, "find-pmc id=%d", pmcid); if (PMC_ID_TO_ROWINDEX(pmcid) >= md->pmd_npmc) return (EINVAL); if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL) { /* * In case of PMC_F_DESCENDANTS child processes we will not find * the current process in the owners hash list. Find the owner * process first and from there lookup the po. 
*/ if ((pp = pmc_find_process_descriptor(curthread->td_proc, PMC_FLAG_NONE)) == NULL) { return ESRCH; } else { opm = pp->pp_pmcs[PMC_ID_TO_ROWINDEX(pmcid)].pp_pmc; if (opm == NULL) return ESRCH; if ((opm->pm_flags & (PMC_F_ATTACHED_TO_OWNER| PMC_F_DESCENDANTS)) != (PMC_F_ATTACHED_TO_OWNER| PMC_F_DESCENDANTS)) return ESRCH; po = opm->pm_owner; } } if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL) return EINVAL; PMCDBG2(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm); *pmc = pm; return 0; } /* * Start a PMC. */ static int pmc_start(struct pmc *pm) { enum pmc_mode mode; struct pmc_owner *po; struct pmc_binding pb; struct pmc_classdep *pcd; int adjri, error, cpu, ri; KASSERT(pm != NULL, ("[pmc,%d] null pm", __LINE__)); mode = PMC_TO_MODE(pm); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); error = 0; PMCDBG3(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri); po = pm->pm_owner; /* * Disallow PMCSTART if a logfile is required but has not been * configured yet. */ if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) && (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return (EDOOFUS); /* programming error */ /* * If this is a sampling mode PMC, log mapping information for * the kernel modules that are currently loaded. */ if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_kernel_mappings(pm); if (PMC_IS_VIRTUAL_MODE(mode)) { /* * If a PMCATTACH has never been done on this PMC, * attach it to its owner process. */ if (LIST_EMPTY(&pm->pm_targets)) error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH : pmc_attach_process(po->po_owner, pm); /* * If the PMC is attached to its owner, then force a context * switch to ensure that the MD state gets set correctly. */ if (error == 0) { pm->pm_state = PMC_STATE_RUNNING; if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) pmc_force_context_switch(); } return (error); } /* * A system-wide PMC. * * Add the owner to the global list if this is a system-wide * sampling PMC. 
*/ if (mode == PMC_MODE_SS) { /* * Log mapping information for all existing processes in the * system. Subsequent mappings are logged as they happen; * see pmc_process_mmap(). */ if (po->po_logprocmaps == 0) { pmc_log_all_process_mappings(po); po->po_logprocmaps = 1; } po->po_sscount++; if (po->po_sscount == 1) { atomic_add_rel_int(&pmc_ss_count, 1); CK_LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext); PMCDBG1(PMC,OPS,1, "po=%p in global list", po); } } /* * Move to the CPU associated with this * PMC, and start the hardware. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); if (!pmc_cpu_is_active(cpu)) return (ENXIO); pmc_select_cpu(cpu); /* * global PMCs are configured at allocation time * so write out the initial value and start the PMC. */ pm->pm_state = PMC_STATE_RUNNING; critical_enter(); if ((error = pcd->pcd_write_pmc(cpu, adjri, PMC_IS_SAMPLING_MODE(mode) ? pm->pm_sc.pm_reloadcount : pm->pm_sc.pm_initial)) == 0) { /* If a sampling mode PMC, reset stalled state. */ if (PMC_IS_SAMPLING_MODE(mode)) pm->pm_pcpu_state[cpu].pps_stalled = 0; /* Indicate that we desire this to run. Start it. */ pm->pm_pcpu_state[cpu].pps_cpustate = 1; error = pcd->pcd_start_pmc(cpu, adjri); } critical_exit(); pmc_restore_cpu_binding(&pb); return (error); } /* * Stop a PMC. */ static int pmc_stop(struct pmc *pm) { struct pmc_owner *po; struct pmc_binding pb; struct pmc_classdep *pcd; int adjri, cpu, error, ri; KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__)); PMCDBG3(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm, PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm)); pm->pm_state = PMC_STATE_STOPPED; /* * If the PMC is a virtual mode one, changing the state to * non-RUNNING is enough to ensure that the PMC never gets * scheduled. * * If this PMC is current running on a CPU, then it will * handled correctly at the time its target process is context * switched out. */ if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) return 0; /* * A system-mode PMC. 
Move to the CPU associated with * this PMC, and stop the hardware. We update the * 'initial count' so that a subsequent PMCSTART will * resume counting from the current hardware count. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] illegal cpu=%d", __LINE__, cpu)); if (!pmc_cpu_is_active(cpu)) return ENXIO; pmc_select_cpu(cpu); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); pm->pm_pcpu_state[cpu].pps_cpustate = 0; critical_enter(); if ((error = pcd->pcd_stop_pmc(cpu, adjri)) == 0) error = pcd->pcd_read_pmc(cpu, adjri, &pm->pm_sc.pm_initial); critical_exit(); pmc_restore_cpu_binding(&pb); po = pm->pm_owner; /* remove this owner from the global list of SS PMC owners */ if (PMC_TO_MODE(pm) == PMC_MODE_SS) { po->po_sscount--; if (po->po_sscount == 0) { atomic_subtract_rel_int(&pmc_ss_count, 1); CK_LIST_REMOVE(po, po_ssnext); epoch_wait_preempt(global_epoch_preempt); PMCDBG1(PMC,OPS,2,"po=%p removed from global list", po); } } return (error); } static struct pmc_classdep * pmc_class_to_classdep(enum pmc_class class) { int n; for (n = 0; n < md->pmd_nclass; n++) if (md->pmd_classdep[n].pcd_class == class) return (&md->pmd_classdep[n]); return (NULL); } #if defined(HWPMC_DEBUG) && defined(KTR) static const char *pmc_op_to_name[] = { #undef __PMC_OP #define __PMC_OP(N, D) #N , __PMC_OPS() NULL }; #endif /* * The syscall interface */ #define PMC_GET_SX_XLOCK(...) 
do { \ sx_xlock(&pmc_sx); \ if (pmc_hook == NULL) { \ sx_xunlock(&pmc_sx); \ return __VA_ARGS__; \ } \ } while (0) #define PMC_DOWNGRADE_SX() do { \ sx_downgrade(&pmc_sx); \ is_sx_downgraded = 1; \ } while (0) static int pmc_syscall_handler(struct thread *td, void *syscall_args) { int error, is_sx_downgraded, op; struct pmc_syscall_args *c; void *pmclog_proc_handle; void *arg; c = (struct pmc_syscall_args *)syscall_args; op = c->pmop_code; arg = c->pmop_data; /* PMC isn't set up yet */ if (pmc_hook == NULL) return (EINVAL); if (op == PMC_OP_CONFIGURELOG) { /* * We cannot create the logging process inside * pmclog_configure_log() because there is a LOR * between pmc_sx and process structure locks. * Instead, pre-create the process and ignite the loop * if everything is fine, otherwise direct the process * to exit. */ error = pmclog_proc_create(td, &pmclog_proc_handle); if (error != 0) goto done_syscall; } PMC_GET_SX_XLOCK(ENOSYS); is_sx_downgraded = 0; PMCDBG3(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op, pmc_op_to_name[op], arg); error = 0; counter_u64_add(pmc_stats.pm_syscalls, 1); switch (op) { /* * Configure a log file. * * XXX This OP will be reworked. */ case PMC_OP_CONFIGURELOG: { struct proc *p; struct pmc *pm; struct pmc_owner *po; struct pmc_op_configurelog cl; if ((error = copyin(arg, &cl, sizeof(cl))) != 0) { pmclog_proc_ignite(pmclog_proc_handle, NULL); break; } /* mark this process as owning a log file */ p = td->td_proc; if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = ENOMEM; break; } /* * If a valid fd was passed in, try to configure that, * otherwise if 'fd' was less than zero and there was * a log file configured, flush its buffers and * de-configure it. */ if (cl.pm_logfd >= 0) { error = pmclog_configure_log(md, po, cl.pm_logfd); pmclog_proc_ignite(pmclog_proc_handle, error == 0 ? 
po : NULL); } else if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = pmclog_close(po); if (error == 0) { LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && pm->pm_state == PMC_STATE_RUNNING) pmc_stop(pm); error = pmclog_deconfigure_log(po); } } else { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = EINVAL; } } break; /* * Flush a log file. */ case PMC_OP_FLUSHLOG: { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } error = pmclog_flush(po, 0); } break; /* * Close a log file. */ case PMC_OP_CLOSELOG: { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } error = pmclog_close(po); } break; /* * Retrieve hardware configuration. */ case PMC_OP_GETCPUINFO: /* CPU information */ { struct pmc_op_getcpuinfo gci; struct pmc_classinfo *pci; struct pmc_classdep *pcd; int cl; gci.pm_cputype = md->pmd_cputype; gci.pm_ncpu = pmc_cpu_max(); gci.pm_npmc = md->pmd_npmc; gci.pm_nclass = md->pmd_nclass; pci = gci.pm_classes; pcd = md->pmd_classdep; for (cl = 0; cl < md->pmd_nclass; cl++, pci++, pcd++) { pci->pm_caps = pcd->pcd_caps; pci->pm_class = pcd->pcd_class; pci->pm_width = pcd->pcd_width; pci->pm_num = pcd->pcd_num; } error = copyout(&gci, arg, sizeof(gci)); } break; /* * Retrieve soft events list. */ case PMC_OP_GETDYNEVENTINFO: { enum pmc_class cl; enum pmc_event ev; struct pmc_op_getdyneventinfo *gei; struct pmc_dyn_event_descr dev; struct pmc_soft *ps; uint32_t nevent; sx_assert(&pmc_sx, SX_LOCKED); gei = (struct pmc_op_getdyneventinfo *) arg; if ((error = copyin(&gei->pm_class, &cl, sizeof(cl))) != 0) break; /* Only SOFT class is dynamic. 
*/ if (cl != PMC_CLASS_SOFT) { error = EINVAL; break; } nevent = 0; for (ev = PMC_EV_SOFT_FIRST; (int)ev <= PMC_EV_SOFT_LAST; ev++) { ps = pmc_soft_ev_acquire(ev); if (ps == NULL) continue; bcopy(&ps->ps_ev, &dev, sizeof(dev)); pmc_soft_ev_release(ps); error = copyout(&dev, &gei->pm_events[nevent], sizeof(struct pmc_dyn_event_descr)); if (error != 0) break; nevent++; } if (error != 0) break; error = copyout(&nevent, &gei->pm_nevent, sizeof(nevent)); } break; /* * Get module statistics */ case PMC_OP_GETDRIVERSTATS: { struct pmc_op_getdriverstats gms; #define CFETCH(a, b, field) a.field = counter_u64_fetch(b.field) CFETCH(gms, pmc_stats, pm_intr_ignored); CFETCH(gms, pmc_stats, pm_intr_processed); CFETCH(gms, pmc_stats, pm_intr_bufferfull); CFETCH(gms, pmc_stats, pm_syscalls); CFETCH(gms, pmc_stats, pm_syscall_errors); CFETCH(gms, pmc_stats, pm_buffer_requests); CFETCH(gms, pmc_stats, pm_buffer_requests_failed); CFETCH(gms, pmc_stats, pm_log_sweeps); #undef CFETCH error = copyout(&gms, arg, sizeof(gms)); } break; /* * Retrieve module version number */ case PMC_OP_GETMODULEVERSION: { uint32_t cv, modv; /* retrieve the client's idea of the ABI version */ if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0) break; /* don't service clients newer than our driver */ modv = PMC_VERSION; if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) { error = EPROGMISMATCH; break; } error = copyout(&modv, arg, sizeof(int)); } break; /* * Retrieve the state of all the PMCs on a given * CPU. 
*/
	case PMC_OP_GETPMCINFO:
	{
		int ari;
		struct pmc *pm;
		size_t pmcinfo_size;
		uint32_t cpu, n, npmc;
		struct pmc_owner *po;
		struct pmc_binding pb;
		struct pmc_classdep *pcd;
		struct pmc_info *p, *pmcinfo;
		struct pmc_op_getpmcinfo *gpi;

		PMC_DOWNGRADE_SX();

		gpi = (struct pmc_op_getpmcinfo *) arg;

		if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
			break;

		if (cpu >= pmc_cpu_max()) {
			error = EINVAL;
			break;
		}

		if (!pmc_cpu_is_active(cpu)) {
			error = ENXIO;
			break;
		}

		/* switch to CPU 'cpu' */
		pmc_save_cpu_binding(&pb);
		pmc_select_cpu(cpu);

		npmc = md->pmd_npmc;

		/*
		 * The buffer is copied out to userland in full.  Rows with
		 * no PMC attached leave pm_mode, pm_event, pm_flags and
		 * pm_reloadcount unwritten by the loop below, so allocate
		 * it zeroed to avoid disclosing stale kernel heap contents.
		 */
		pmcinfo_size = npmc * sizeof(struct pmc_info);
		pmcinfo = malloc(pmcinfo_size, M_PMC, M_WAITOK | M_ZERO);

		p = pmcinfo;

		for (n = 0; n < md->pmd_npmc; n++, p++) {

			pcd = pmc_ri_to_classdep(md, n, &ari);

			KASSERT(pcd != NULL,
			    ("[pmc,%d] null pcd ri=%d", __LINE__, n));

			if ((error = pcd->pcd_describe(cpu, ari, p, &pm)) != 0)
				break;

			if (PMC_ROW_DISP_IS_STANDALONE(n))
				p->pm_rowdisp = PMC_DISP_STANDALONE;
			else if (PMC_ROW_DISP_IS_THREAD(n))
				p->pm_rowdisp = PMC_DISP_THREAD;
			else
				p->pm_rowdisp = PMC_DISP_FREE;

			p->pm_ownerpid = -1;

			if (pm == NULL)	/* no PMC associated */
				continue;

			po = pm->pm_owner;

			KASSERT(po->po_owner != NULL,
			    ("[pmc,%d] pmc_owner had a null proc pointer",
				__LINE__));

			p->pm_ownerpid = po->po_owner->p_pid;
			p->pm_mode     = PMC_TO_MODE(pm);
			p->pm_event    = pm->pm_event;
			p->pm_flags    = pm->pm_flags;

			if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
				p->pm_reloadcount =
				    pm->pm_sc.pm_reloadcount;
		}

		pmc_restore_cpu_binding(&pb);

		/* now copy out the PMC info collected */
		if (error == 0)
			error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);

		free(pmcinfo, M_PMC);
	}
	break;

	/*
	 * Set the administrative state of a PMC.  I.e. whether
	 * the PMC is to be used or not.
*/ case PMC_OP_PMCADMIN: { int cpu, ri; enum pmc_state request; struct pmc_cpu *pc; struct pmc_hw *phw; struct pmc_op_pmcadmin pma; struct pmc_binding pb; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); error = priv_check(td, PRIV_PMC_MANAGE); if (error) break; if ((error = copyin(arg, &pma, sizeof(pma))) != 0) break; cpu = pma.pm_cpu; if (cpu < 0 || cpu >= (int) pmc_cpu_max()) { error = EINVAL; break; } if (!pmc_cpu_is_active(cpu)) { error = ENXIO; break; } request = pma.pm_state; if (request != PMC_STATE_DISABLED && request != PMC_STATE_FREE) { error = EINVAL; break; } ri = pma.pm_pmc; /* pmc id == row index */ if (ri < 0 || ri >= (int) md->pmd_npmc) { error = EINVAL; break; } /* * We can't disable a PMC with a row-index allocated * for process virtual PMCs. */ if (PMC_ROW_DISP_IS_THREAD(ri) && request == PMC_STATE_DISABLED) { error = EBUSY; break; } /* * otherwise, this PMC on this CPU is either free or * in system-wide mode. */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); pc = pmc_pcpu[cpu]; phw = pc->pc_hwpmcs[ri]; /* * XXX do we need some kind of 'forced' disable? */ if (phw->phw_pmc == NULL) { if (request == PMC_STATE_DISABLED && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) { phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED; PMC_MARK_ROW_STANDALONE(ri); } else if (request == PMC_STATE_FREE && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) { phw->phw_state |= PMC_PHW_FLAG_IS_ENABLED; PMC_UNMARK_ROW_STANDALONE(ri); } /* other cases are a no-op */ } else error = EBUSY; pmc_restore_cpu_binding(&pb); } break; /* * Allocate a PMC. 
*/ case PMC_OP_PMCALLOCATE: { int adjri, n; u_int cpu; uint32_t caps; struct pmc *pmc; enum pmc_mode mode; struct pmc_hw *phw; struct pmc_binding pb; struct pmc_classdep *pcd; struct pmc_op_pmcallocate pa; if ((error = copyin(arg, &pa, sizeof(pa))) != 0) break; caps = pa.pm_caps; mode = pa.pm_mode; cpu = pa.pm_cpu; if ((mode != PMC_MODE_SS && mode != PMC_MODE_SC && mode != PMC_MODE_TS && mode != PMC_MODE_TC) || (cpu != (u_int) PMC_CPU_ANY && cpu >= pmc_cpu_max())) { error = EINVAL; break; } /* * Virtual PMCs should only ask for a default CPU. * System mode PMCs need to specify a non-default CPU. */ if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) || (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) { error = EINVAL; break; } /* * Check that an inactive CPU is not being asked for. */ if (PMC_IS_SYSTEM_MODE(mode) && !pmc_cpu_is_active(cpu)) { error = ENXIO; break; } /* * Refuse an allocation for a system-wide PMC if this * process has been jailed, or if this process lacks * super-user credentials and the sysctl tunable * 'security.bsd.unprivileged_syspmcs' is zero. 
*/ if (PMC_IS_SYSTEM_MODE(mode)) { if (jailed(curthread->td_ucred)) { error = EPERM; break; } if (!pmc_unprivileged_syspmcs) { error = priv_check(curthread, PRIV_PMC_SYSTEM); if (error) break; } } /* * Look for valid values for 'pm_flags' */ if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) != 0) { error = EINVAL; break; } /* PMC_F_USERCALLCHAIN is only valid with PMC_F_CALLCHAIN */ if ((pa.pm_flags & (PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) == PMC_F_USERCALLCHAIN) { error = EINVAL; break; } /* PMC_F_USERCALLCHAIN is only valid for sampling mode */ if (pa.pm_flags & PMC_F_USERCALLCHAIN && mode != PMC_MODE_TS && mode != PMC_MODE_SS) { error = EINVAL; break; } /* process logging options are not allowed for system PMCs */ if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags & (PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) { error = EINVAL; break; } /* * All sampling mode PMCs need to be able to interrupt the * CPU. */ if (PMC_IS_SAMPLING_MODE(mode)) caps |= PMC_CAP_INTERRUPT; /* A valid class specifier should have been passed in. */ pcd = pmc_class_to_classdep(pa.pm_class); if (pcd == NULL) { error = EINVAL; break; } /* The requested PMC capabilities should be feasible. 
*/ if ((pcd->pcd_caps & caps) != caps) { error = EOPNOTSUPP; break; } PMCDBG4(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d", pa.pm_ev, caps, mode, cpu); pmc = pmc_allocate_pmc_descriptor(); pmc->pm_id = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class, PMC_ID_INVALID); pmc->pm_event = pa.pm_ev; pmc->pm_state = PMC_STATE_FREE; pmc->pm_caps = caps; pmc->pm_flags = pa.pm_flags; /* XXX set lower bound on sampling for process counters */ if (PMC_IS_SAMPLING_MODE(mode)) pmc->pm_sc.pm_reloadcount = pa.pm_count; else pmc->pm_sc.pm_initial = pa.pm_count; /* switch thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); #define PMC_IS_SHAREABLE_PMC(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state & \ PMC_PHW_FLAG_IS_SHAREABLE) #define PMC_IS_UNALLOCATED(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL) if (PMC_IS_SYSTEM_MODE(mode)) { pmc_select_cpu(cpu); for (n = pcd->pcd_ri; n < (int) md->pmd_npmc; n++) { pcd = pmc_ri_to_classdep(md, n, &adjri); if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, cpu) == 0 && (PMC_IS_UNALLOCATED(cpu, n) || PMC_IS_SHAREABLE_PMC(cpu, n)) && pcd->pcd_allocate_pmc(cpu, adjri, pmc, &pa) == 0) break; } } else { /* Process virtual mode */ for (n = pcd->pcd_ri; n < (int) md->pmd_npmc; n++) { pcd = pmc_ri_to_classdep(md, n, &adjri); if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, PMC_CPU_ANY) == 0 && pcd->pcd_allocate_pmc(curthread->td_oncpu, adjri, pmc, &pa) == 0) break; } } #undef PMC_IS_UNALLOCATED #undef PMC_IS_SHAREABLE_PMC pmc_restore_cpu_binding(&pb); if (n == (int) md->pmd_npmc) { pmc_destroy_pmc_descriptor(pmc); pmc = NULL; error = EINVAL; break; } /* Fill in the correct value in the ID field */ pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n); PMCDBG5(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x", pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id); /* Process mode PMCs with logging enabled need log files */ if (pmc->pm_flags & 
(PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* All system mode sampling PMCs require a log file */ if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* * Configure global pmc's immediately */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) { pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); phw = pmc_pcpu[cpu]->pc_hwpmcs[n]; pcd = pmc_ri_to_classdep(md, n, &adjri); if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 || (error = pcd->pcd_config_pmc(cpu, adjri, pmc)) != 0) { (void) pcd->pcd_release_pmc(cpu, adjri, pmc); pmc_destroy_pmc_descriptor(pmc); pmc = NULL; pmc_restore_cpu_binding(&pb); error = EPERM; break; } pmc_restore_cpu_binding(&pb); } pmc->pm_state = PMC_STATE_ALLOCATED; pmc->pm_class = pa.pm_class; /* * mark row disposition */ if (PMC_IS_SYSTEM_MODE(mode)) PMC_MARK_ROW_STANDALONE(n); else PMC_MARK_ROW_THREAD(n); /* * Register this PMC with the current thread as its owner. */ if ((error = pmc_register_owner(curthread->td_proc, pmc)) != 0) { pmc_release_pmc_descriptor(pmc); pmc_destroy_pmc_descriptor(pmc); pmc = NULL; break; } /* * Return the allocated index. */ pa.pm_pmcid = pmc->pm_id; error = copyout(&pa, arg, sizeof(pa)); } break; /* * Attach a PMC to a process. 
*/ case PMC_OP_PMCATTACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { error = EINVAL; break; } /* PMCs may be (re)attached only when allocated or stopped */ if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } else if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED) { error = EINVAL; break; } /* lookup pid */ if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Ignore processes that are working on exiting. */ if (p->p_flag & P_WEXIT) { error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ break; } /* * we are allowed to attach a PMC to a process if * we can debug it. */ error = p_candebug(curthread, p); PROC_UNLOCK(p); if (error == 0) error = pmc_attach_process(p, pm); } break; /* * Detach an attached PMC from a process. */ case PMC_OP_PMCDETACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Treat processes that are in the process of exiting * as if they were not present. */ if (p->p_flag & P_WEXIT) error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ if (error == 0) error = pmc_detach_process(p, pm); } break; /* * Retrieve the MSR number associated with the counter * 'pmc_id'. This allows processes to directly use RDPMC * instructions to read their PMCs, without the overhead of a * system call. 
*/ case PMC_OP_PMCGETMSR: { int adjri, ri; struct pmc *pm; struct pmc_target *pt; struct pmc_op_getmsr gm; struct pmc_classdep *pcd; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &gm, sizeof(gm))) != 0) break; if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0) break; /* * The allocated PMC has to be a process virtual PMC, * i.e., of type MODE_T[CS]. Global PMCs can only be * read using the PMCREAD operation since they may be * allocated on a different CPU than the one we could * be running on at the time of the RDPMC instruction. * * The GETMSR operation is not allowed for PMCs that * are inherited across processes. */ if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) || (pm->pm_flags & PMC_F_DESCENDANTS)) { error = EINVAL; break; } /* * It only makes sense to use a RDPMC (or its * equivalent instruction on non-x86 architectures) on * a process that has allocated and attached a PMC to * itself. Conversely the PMC is only allowed to have * one process attached to it -- its owner. */ if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL || LIST_NEXT(pt, pt_next) != NULL || pt->pt_process->pp_proc != pm->pm_owner->po_owner) { error = EINVAL; break; } ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); /* PMC class has no 'GETMSR' support */ if (pcd->pcd_get_msr == NULL) { error = ENOSYS; break; } if ((error = (*pcd->pcd_get_msr)(adjri, &gm.pm_msr)) < 0) break; if ((error = copyout(&gm, arg, sizeof(gm))) < 0) break; /* * Mark our process as using MSRs. Update machine * state using a forced context switch. */ pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS; pmc_force_context_switch(); } break; /* * Release an allocated PMC */ case PMC_OP_PMCRELEASE: { pmc_id_t pmcid; struct pmc *pm; struct pmc_owner *po; struct pmc_op_simple sp; /* * Find PMC pointer for the named PMC. * * Use pmc_release_pmc_descriptor() to switch off the * PMC, remove all its target threads, and remove the * PMC from its owner's list. * * Remove the owner record if this is the last PMC * owned. 
* * Free up space. */ if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; po = pm->pm_owner; pmc_release_pmc_descriptor(pm); pmc_maybe_remove_owner(po); pmc_destroy_pmc_descriptor(pm); } break; /* * Read and/or write a PMC. */ case PMC_OP_PMCRW: { int adjri; struct pmc *pm; uint32_t cpu, ri; pmc_value_t oldvalue; struct pmc_binding pb; struct pmc_op_pmcrw prw; struct pmc_classdep *pcd; struct pmc_op_pmcrw *pprw; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &prw, sizeof(prw))) != 0) break; ri = 0; PMCDBG2(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid, prw.pm_flags); /* must have at least one flag set */ if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) { error = EINVAL; break; } /* locate pmc descriptor */ if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0) break; /* Can't read a PMC that hasn't been started. */ if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } /* writing a new value is allowed only for 'STOPPED' pmcs */ if (pm->pm_state == PMC_STATE_RUNNING && (prw.pm_flags & PMC_F_NEWVALUE)) { error = EBUSY; break; } if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) { /* * If this PMC is attached to its owner (i.e., * the process requesting this operation) and * is running, then attempt to get an * upto-date reading from hardware for a READ. * Writes are only allowed when the PMC is * stopped, so only update the saved value * field. * * If the PMC is not running, or is not * attached to its owner, read/write to the * savedvalue field. 
*/ ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); mtx_pool_lock_spin(pmc_mtxpool, pm); cpu = curthread->td_oncpu; if (prw.pm_flags & PMC_F_OLDVALUE) { if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) && (pm->pm_state == PMC_STATE_RUNNING)) error = (*pcd->pcd_read_pmc)(cpu, adjri, &oldvalue); else oldvalue = pm->pm_gv.pm_savedvalue; } if (prw.pm_flags & PMC_F_NEWVALUE) pm->pm_gv.pm_savedvalue = prw.pm_value; mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { /* System mode PMCs */ cpu = PMC_TO_CPU(pm); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); if (!pmc_cpu_is_active(cpu)) { error = ENXIO; break; } /* move this thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); critical_enter(); /* save old value */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = (*pcd->pcd_read_pmc)(cpu, adjri, &oldvalue))) goto error; /* write out new value */ if (prw.pm_flags & PMC_F_NEWVALUE) error = (*pcd->pcd_write_pmc)(cpu, adjri, prw.pm_value); error: critical_exit(); pmc_restore_cpu_binding(&pb); if (error) break; } pprw = (struct pmc_op_pmcrw *) arg; #ifdef HWPMC_DEBUG if (prw.pm_flags & PMC_F_NEWVALUE) PMCDBG3(PMC,OPS,2, "rw id=%d new %jx -> old %jx", ri, prw.pm_value, oldvalue); else if (prw.pm_flags & PMC_F_OLDVALUE) PMCDBG2(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue); #endif /* return old value if requested */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = copyout(&oldvalue, &pprw->pm_value, sizeof(prw.pm_value)))) break; } break; /* * Set the sampling rate for a sampling mode PMC and the * initial count for a counting mode PMC. 
*/ case PMC_OP_PMCSETCOUNT: { struct pmc *pm; struct pmc_op_pmcsetcount sc; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sc, sizeof(sc))) != 0) break; if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0) break; if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pm->pm_sc.pm_reloadcount = sc.pm_count; else pm->pm_sc.pm_initial = sc.pm_count; } break; /* * Start a PMC. */ case PMC_OP_PMCSTART: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmcid %x != id %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_RUNNING) /* already running */ break; else if (pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_ALLOCATED) { error = EINVAL; break; } error = pmc_start(pm); } break; /* * Stop a PMC. */ case PMC_OP_PMCSTOP: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; /* * Mark the PMC as inactive and invoke the MD stop * routines if needed. */ if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmc id %x != pmcid %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */ break; else if (pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } error = pmc_stop(pm); } break; /* * Write a user supplied value to the log file. 
*/ case PMC_OP_WRITELOG: { struct pmc_op_writelog wl; struct pmc_owner *po; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &wl, sizeof(wl))) != 0) break; if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) { error = EINVAL; break; } error = pmclog_process_userlog(po, &wl); } break; default: error = EINVAL; break; } if (is_sx_downgraded) sx_sunlock(&pmc_sx); else sx_xunlock(&pmc_sx); done_syscall: if (error) counter_u64_add(pmc_stats.pm_syscall_errors, 1); return (error); } /* * Helper functions */ /* * Mark the thread as needing callchain capture and post an AST. The * actual callchain capture will be done in a context where it is safe * to take page faults. */ static void pmc_post_callchain_callback(void) { struct thread *td; td = curthread; /* * If there is multiple PMCs for the same interrupt ignore new post */ if (td->td_pflags & TDP_CALLCHAIN) return; /* * Mark this thread as needing callchain capture. * `td->td_pflags' will be safe to touch because this thread * was in user space when it was interrupted. */ td->td_pflags |= TDP_CALLCHAIN; /* * Don't let this thread migrate between CPUs until callchain * capture completes. */ sched_pin(); return; } /* * Find a free slot in the per-cpu array of samples and capture the * current callchain there. If a sample was successfully added, a bit * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook * needs to be invoked from the clock handler. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ static int pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) { int error, cpu, callchaindepth, inuserspace; struct thread *td; struct pmc_sample *ps; struct pmc_samplebuffer *psb; error = 0; /* * Allocate space for a sample buffer. 
*/ cpu = curcpu; psb = pmc_pcpu[cpu]->pc_sb[ring]; inuserspace = TRAPF_USERMODE(tf); ps = psb->ps_write; if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { counter_u64_add(ps->ps_pmc->pm_runcount, -1); counter_u64_add(pmc_stats.pm_overwrites, 1); ps->ps_nsamples = 0; } else if (ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_pcpu_state[cpu].pps_stalled = 1; counter_u64_add(pmc_stats.pm_intr_bufferfull, 1); PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); callchaindepth = 1; error = ENOMEM; goto done; } /* Fill in entry. */ PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ td = curthread; ps->ps_pmc = pm; ps->ps_td = td; ps->ps_pid = td->td_proc->p_pid; ps->ps_tid = td->td_tid; ps->ps_tsc = pmc_rdtsc(); ps->ps_cpu = cpu; ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? pmc_callchaindepth : 1; if (callchaindepth == 1) ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); else { /* * Kernel stack traversals can be done immediately, * while we defer to an AST for user space traversals. 
*/ if (!inuserspace) { callchaindepth = pmc_save_kernel_callchain(ps->ps_pc, callchaindepth, tf); } else { pmc_post_callchain_callback(); callchaindepth = PMC_SAMPLE_INUSE; } } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ if (ring == PMC_UR) { ps->ps_nsamples_actual = callchaindepth; /* mark entry as in use */ ps->ps_nsamples = PMC_SAMPLE_INUSE; } else ps->ps_nsamples = callchaindepth; /* mark entry as in use */ /* increment write pointer, modulo ring buffer size */ ps++; if (ps == psb->ps_fence) psb->ps_write = psb->ps_samples; else psb->ps_write = ps; done: /* mark CPU as needing processing */ if (callchaindepth != PMC_SAMPLE_INUSE) DPCPU_SET(pmc_sampled, 1); return (error); } /* * Interrupt processing. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ int pmc_process_interrupt(int ring, struct pmc *pm, struct trapframe *tf) { struct thread *td; td = curthread; if ((pm->pm_flags & PMC_F_USERCALLCHAIN) && (td->td_proc->p_flag & P_KPROC) == 0 && !TRAPF_USERMODE(tf)) { atomic_add_int(&td->td_pmcpend, 1); return (pmc_add_sample(PMC_UR, pm, tf)); } return (pmc_add_sample(ring, pm, tf)); } /* * Capture a user call chain. This function will be called from ast() * before control returns to userland and before the process gets * rescheduled. 
*/

static void
pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
{
	struct pmc *pm;
	struct thread *td;
	struct pmc_sample *ps, *ps_end;
	struct pmc_samplebuffer *psb;
	int nsamples, nrecords, pass;
#ifdef	INVARIANTS
	int ncallchains;
	int nfree;
#endif

	psb = pmc_pcpu[cpu]->pc_sb[ring];
	td = curthread;

	KASSERT(td->td_pflags & TDP_CALLCHAIN,
	    ("[pmc,%d] Retrieving callchain for thread that doesn't want it",
		__LINE__));

#ifdef	INVARIANTS
	ncallchains = 0;
	nfree = 0;
#endif
	/* No limit on the number of entries completed unless set below. */
	nrecords = INT_MAX;
	pass = 0;
 restart:
	/*
	 * For the PMC_UR ring, fetch and clear this thread's count of
	 * pending records; it bounds how many entries this pass completes.
	 */
	if (ring == PMC_UR)
		nrecords = atomic_readandclear_32(&td->td_pmcpend);

	/*
	 * Iterate through all deferred callchain requests.
	 * Walk from the current read pointer to the current
	 * write pointer.
	 */

	ps = psb->ps_read;
	ps_end = psb->ps_write;
	do {
#ifdef	INVARIANTS
		if (ps->ps_nsamples == PMC_SAMPLE_FREE) {
			nfree++;
			goto next;
		}

		if ((ps->ps_pmc == NULL) ||
		    (ps->ps_pmc->pm_state != PMC_STATE_RUNNING))
			nfree++;
#endif
		/* Only entries still awaiting capture are of interest ... */
		if (ps->ps_nsamples != PMC_SAMPLE_INUSE)
			goto next;
		/* ... and only those recorded for the current thread. */
		if (ps->ps_td != td)
			goto next;

		KASSERT(ps->ps_cpu == cpu,
		    ("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__,
			ps->ps_cpu, PCPU_GET(cpuid)));

		pm = ps->ps_pmc;

		KASSERT(pm->pm_flags & PMC_F_CALLCHAIN,
		    ("[pmc,%d] Retrieving callchain for PMC that doesn't "
			"want it", __LINE__));

		KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
		    ("[pmc,%d] runcount %ld", __LINE__, (unsigned long)counter_u64_fetch(pm->pm_runcount)));

		/*
		 * On the PMC_UR ring the entry already holds
		 * ps_nsamples_actual frames; the user frames are stored
		 * after them.
		 */
		if (ring == PMC_UR) {
			nsamples = ps->ps_nsamples_actual;
			counter_u64_add(pmc_stats.pm_merges, 1);
		} else
			nsamples = 0;

		/*
		 * Retrieve the callchain and mark the sample buffer
		 * as 'processable' by the timer tick sweep code.
		 */

#ifdef INVARIANTS
		ncallchains++;
#endif

		if (__predict_true(nsamples < pmc_callchaindepth - 1))
			nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples,
		       pmc_callchaindepth - nsamples - 1, tf);
		/* Write barrier between filling ps_pc[] and setting ps_nsamples. */
		wmb();
		ps->ps_nsamples = nsamples;
		/* Stop once the last pending record has been completed. */
		if (nrecords-- == 1)
			break;
next:
		/* increment the pointer, modulo sample ring size */
		if (++ps == psb->ps_fence)
			ps = psb->ps_samples;
	} while (ps != ps_end);
	/*
	 * td_pmcpend became non-zero again during the walk: make one
	 * more pass over the ring, then clear the count regardless.
	 */
	if (__predict_false(ring == PMC_UR && td->td_pmcpend)) {
		if (pass == 0) {
			pass = 1;
			goto restart;
		}
		/* only collect samples for this part once */
		td->td_pmcpend = 0;
	}

#ifdef INVARIANTS
	if (ring == PMC_HR)
		KASSERT(ncallchains > 0 || nfree > 0,
		    ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
			    cpu));
#endif

	/* mark CPU as needing processing */
	DPCPU_SET(pmc_sampled, 1);
}


/*
 * Discard queued samples in ring 'ring' of CPU 'cpu': starting at the
 * read pointer, each non-free entry drops the runcount reference held on
 * its PMC and is marked PMC_SAMPLE_FREE.  At most pmc_nsamples entries
 * are visited.
 */
static void
pmc_flush_ring(int cpu, int ring)
{
	struct pmc *pm;
	struct pmc_sample *ps;
	struct pmc_samplebuffer *psb;
	int n;

	psb = pmc_pcpu[cpu]->pc_sb[ring];

	for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */

		ps = psb->ps_read;
		if (ps->ps_nsamples == PMC_SAMPLE_FREE)
			goto next;
		pm = ps->ps_pmc;
		counter_u64_add(pm->pm_runcount, -1);
		ps->ps_nsamples = PMC_SAMPLE_FREE;
		/* increment read pointer, modulo sample size */
	next:
		if (++ps == psb->ps_fence)
			psb->ps_read = psb->ps_samples;
		else
			psb->ps_read = ps;
	}
}

/*
 * Flush every sample ring (0 .. PMC_NUM_SR - 1) of CPU 'cpu'.
 */
void
pmc_flush_samples(int cpu)
{
	int n;

	for (n = 0; n < PMC_NUM_SR; n++)
		pmc_flush_ring(cpu, n);
}


/*
 * Process saved PC samples.
*/

static void
pmc_process_samples(int cpu, int ring)
{
	struct pmc *pm;
	int adjri, n;
	struct thread *td;
	struct pmc_owner *po;
	struct pmc_sample *ps;
	struct pmc_classdep *pcd;
	struct pmc_samplebuffer *psb;

	/* Must be called on the CPU whose ring is being drained. */
	KASSERT(PCPU_GET(cpuid) == cpu,
	    ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
		PCPU_GET(cpuid), cpu));

	psb = pmc_pcpu[cpu]->pc_sb[ring];

	for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */

		ps = psb->ps_read;
		/* A free entry at the read pointer ends the sweep. */
		if (ps->ps_nsamples == PMC_SAMPLE_FREE)
			break;

		pm = ps->ps_pmc;

		KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
		    ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
			 (unsigned long)counter_u64_fetch(pm->pm_runcount)));

		po = pm->pm_owner;

		KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
		    ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
			pm, PMC_TO_MODE(pm)));

		/* Ignore PMCs that have been switched off */
		if (pm->pm_state != PMC_STATE_RUNNING)
			goto entrydone;

		/* If there is a pending AST wait for completion */
		if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
			/* Need a rescan at a later time. */
			DPCPU_SET(pmc_sampled, 1);
			break;
		}

		PMCDBG6(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu,
		    pm, ps->ps_nsamples, ps->ps_flags,
		    (int) (psb->ps_write - psb->ps_samples),
		    (int) (psb->ps_read - psb->ps_samples));

		/*
		 * If this is a process-mode PMC that is attached to
		 * its owner, and if the PC is in user mode, update
		 * profiling statistics like timer-based profiling
		 * would have done.
		 *
		 * Otherwise, this is either a sampling-mode PMC that
		 * is attached to a different process than its owner,
		 * or a system-wide sampling PMC.  Dispatch a log
		 * entry to the PMC's owner process.
		 */
		if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
			if (ps->ps_flags & PMC_CC_F_USERSPACE) {
				td = FIRST_THREAD_IN_PROC(po->po_owner);
				addupc_intr(td, ps->ps_pc[0], 1);
			}
		} else
			pmclog_process_callchain(pm, ps);

	entrydone:
		ps->ps_nsamples = 0; /* mark entry as free */
		/* Drop the reference on the PMC held by this entry. */
		counter_u64_add(pm->pm_runcount, -1);

		/* increment read pointer, modulo sample size */
		if (++ps == psb->ps_fence)
			psb->ps_read = psb->ps_samples;
		else
			psb->ps_read = ps;
	}

	counter_u64_add(pmc_stats.pm_log_sweeps, 1);

	/* Do not re-enable stalled PMCs if we failed to process any samples */
	if (n == 0)
		return;

	/*
	 * Restart any stalled sampling PMCs on this CPU.
	 *
	 * If the NMI handler sets the pm_stalled field of a PMC after
	 * the check below, we'll end up processing the stalled PMC at
	 * the next hardclock tick.
	 */
	for (n = 0; n < md->pmd_npmc; n++) {
		pcd = pmc_ri_to_classdep(md, n, &adjri);
		KASSERT(pcd != NULL,
		    ("[pmc,%d] null pcd ri=%d", __LINE__, n));
		/* Fetch the PMC currently configured on row 'n', if any. */
		(void) (*pcd->pcd_get_config)(cpu,adjri,&pm);

		if (pm == NULL ||			 /* !cfg'ed */
		    pm->pm_state != PMC_STATE_RUNNING || /* !active */
		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
			!pm->pm_pcpu_state[cpu].pps_cpustate  || /* !desired */
		    !pm->pm_pcpu_state[cpu].pps_stalled) /* !stalled */
			continue;

		pm->pm_pcpu_state[cpu].pps_stalled = 0;
		(*pcd->pcd_start_pmc)(cpu, adjri);
	}
}

/*
 * Event handlers.
 */

/*
 * Handle a process exit.
 *
 * Remove this process from all hash tables.  If this process
 * owned any PMCs, turn off those PMCs and deallocate them,
 * removing any associations with target processes.
 *
 * This function will be called by the last 'thread' of a
 * process.
 *
 * XXX This eventhandler gets called early in the exit process.
 * Consider using a 'hook' invocation from thread_exit() or equivalent
 * spot.  Another negative is that kse_exit doesn't seem to call
 * exit1() [??].
* */ static void pmc_process_exit(void *arg __unused, struct proc *p) { struct pmc *pm; int adjri, cpu; unsigned int ri; int is_using_hwpmcs; struct pmc_owner *po; struct pmc_process *pp; struct pmc_classdep *pcd; pmc_value_t newvalue, tmp; PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); /* * Log a sysexit event to all SS PMC owners. */ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_sysexit(po, p->p_pid); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG3(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); /* * Since this code is invoked by the last thread in an exiting * process, we would have context switched IN at some prior * point. However, with PREEMPTION, kernel mode context * switches may happen any time, so we want to disable a * context switch OUT till we get any PMCs targeting this * process off the hardware. * * We also need to atomically remove this process' * entry from our target process hash table, using * PMC_FLAG_REMOVE. */ PMCDBG3(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); critical_enter(); /* no preemption */ cpu = curthread->td_oncpu; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_REMOVE)) != NULL) { PMCDBG2(PRC,EXT,2, "process-exit proc=%p pmc-process=%p", p, pp); /* * The exiting process could the target of * some PMCs which will be running on * currently executing CPU. * * We need to turn these PMCs off like we * would do at context switch OUT time. */ for (ri = 0; ri < md->pmd_npmc; ri++) { /* * Pick up the pmc pointer from hardware * state similar to the CSW_OUT code. 
*/ pm = NULL; pcd = pmc_ri_to_classdep(md, ri, &adjri); (void) (*pcd->pcd_get_config)(cpu, adjri, &pm); PMCDBG2(PRC,EXT,2, "ri=%d pm=%p", ri, pm); if (pm == NULL || !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) continue; PMCDBG4(PRC,EXT,2, "ppmcs[%d]=%p pm=%p " "state=%d", ri, pp->pp_pmcs[ri].pp_pmc, pm, pm->pm_state); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] bad runcount ri %d rc %ld", __LINE__, ri, (unsigned long)counter_u64_fetch(pm->pm_runcount))); /* * Change desired state, and then stop if not * stalled. This two-step dance should avoid * race conditions where an interrupt re-enables * the PMC after this code has already checked * the pm_stalled flag. */ if (pm->pm_pcpu_state[cpu].pps_cpustate) { pm->pm_pcpu_state[cpu].pps_cpustate = 0; if (!pm->pm_pcpu_state[cpu].pps_stalled) { (void) pcd->pcd_stop_pmc(cpu, adjri); if (PMC_TO_MODE(pm) == PMC_MODE_TC) { pcd->pcd_read_pmc(cpu, adjri, &newvalue); tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin( pmc_mtxpool, pm); } } } counter_u64_add(pm->pm_runcount, -1); KASSERT((int) counter_u64_fetch(pm->pm_runcount) >= 0, ("[pmc,%d] runcount is %d", __LINE__, ri)); (void) pcd->pcd_config_pmc(cpu, adjri, NULL); } /* * Inform the MD layer of this pseudo "context switch * out" */ (void) md->pmd_switch_out(pmc_pcpu[cpu], pp); critical_exit(); /* ok to be pre-empted now */ /* * Unlink this process from the PMCs that are * targeting it. This will send a signal to * all PMC owner's whose PMCs are orphaned. * * Log PMC value at exit time if requested. 
*/ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm))) pmclog_process_procexit(pm, pp); pmc_unlink_target_process(pm, pp); } free(pp, M_PMC); } else critical_exit(); /* pp == NULL */ /* * If the process owned PMCs, free them up and free up * memory. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } sx_xunlock(&pmc_sx); } /* * Handle a process fork. * * If the parent process 'p1' is under HWPMC monitoring, then copy * over any attached PMCs that have 'do_descendants' semantics. */ static void pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc, int flags) { int is_using_hwpmcs; unsigned int ri; uint32_t do_descendants; struct pmc *pm; struct pmc_owner *po; struct pmc_process *ppnew, *ppold; (void) flags; /* unused parameter */ PROC_LOCK(p1); is_using_hwpmcs = p1->p_flag & P_HWPMC; PROC_UNLOCK(p1); /* * If there are system-wide sampling PMCs active, we need to * log all fork events to their owner's logs. */ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); pmclog_process_proccreate(po, newproc, 1); } - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG4(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1, p1->p_pid, p1->p_comm, newproc); /* * If the parent process (curthread->td_proc) is a * target of any PMCs, look for PMCs that are to be * inherited, and link these into the new process * descriptor. 
*/ if ((ppold = pmc_find_process_descriptor(curthread->td_proc, PMC_FLAG_NONE)) == NULL) goto done; /* nothing to do */ do_descendants = 0; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL) do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS; if (do_descendants == 0) /* nothing to do */ goto done; /* * Now mark the new process as being tracked by this driver. */ PROC_LOCK(newproc); newproc->p_flag |= P_HWPMC; PROC_UNLOCK(newproc); /* allocate a descriptor for the new process */ if ((ppnew = pmc_find_process_descriptor(newproc, PMC_FLAG_ALLOCATE)) == NULL) goto done; /* * Run through all PMCs that were targeting the old process * and which specified F_DESCENDANTS and attach them to the * new process. * * Log the fork event to all owners of PMCs attached to this * process, if not already logged. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL && (pm->pm_flags & PMC_F_DESCENDANTS)) { pmc_link_target_process(pm, ppnew); po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); } done: sx_xunlock(&pmc_sx); } static void pmc_process_threadcreate(struct thread *td) { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadcreate(po, td, 1); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void pmc_process_threadexit(struct thread *td) { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadexit(po, td); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void pmc_process_proccreate(struct proc *p) { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); 
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_proccreate(po, p, 1 /* sync */); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void pmc_process_allproc(struct pmc *pm) { struct pmc_owner *po; struct thread *td; struct proc *p; po = pm->pm_owner; if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { pmclog_process_proccreate(po, p, 0 /* sync */); PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td) pmclog_process_threadcreate(po, td, 0 /* sync */); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pmclog_flush(po, 0); } static void pmc_kld_load(void *arg __unused, linker_file_t lf) { struct pmc_owner *po; /* * Notify owners of system sampling PMCs about KLD operations. */ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, (pid_t) -1, (uintfptr_t) lf->address, lf->filename); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); /* * TODO: Notify owners of (all) process-sampling PMCs too. */ } static void pmc_kld_unload(void *arg __unused, const char *filename __unused, caddr_t address, size_t size) { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, (pid_t) -1, (uintfptr_t) address, (uintfptr_t) address + size); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); /* * TODO: Notify owners of process-sampling PMCs. */ } /* * initialization */ static const char * pmc_name_of_pmcclass(enum pmc_class class) { switch (class) { #undef __PMC_CLASS #define __PMC_CLASS(S,V,D) \ case PMC_CLASS_##S: \ return #S; __PMC_CLASSES(); default: return (""); } } /* * Base class initializer: allocate structure and set default classes. 
*/ struct pmc_mdep * pmc_mdep_alloc(int nclasses) { struct pmc_mdep *md; int n; /* SOFT + md classes */ n = 1 + nclasses; md = malloc(sizeof(struct pmc_mdep) + n * sizeof(struct pmc_classdep), M_PMC, M_WAITOK|M_ZERO); md->pmd_nclass = n; /* Add base class. */ pmc_soft_initialize(md); return md; } void pmc_mdep_free(struct pmc_mdep *md) { pmc_soft_finalize(md); free(md, M_PMC); } static int generic_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; return (0); } static int generic_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; return (0); } static struct pmc_mdep * pmc_generic_cpu_initialize(void) { struct pmc_mdep *md; md = pmc_mdep_alloc(0); md->pmd_cputype = PMC_CPU_GENERIC; md->pmd_pcpu_init = NULL; md->pmd_pcpu_fini = NULL; md->pmd_switch_in = generic_switch_in; md->pmd_switch_out = generic_switch_out; return (md); } static void pmc_generic_cpu_finalize(struct pmc_mdep *md) { (void) md; } static int pmc_initialize(void) { int c, cpu, error, n, ri; unsigned int maxcpu, domain; struct pcpu *pc; struct pmc_binding pb; struct pmc_sample *ps; struct pmc_classdep *pcd; struct pmc_samplebuffer *sb; md = NULL; error = 0; pmc_stats.pm_intr_ignored = counter_u64_alloc(M_WAITOK); pmc_stats.pm_intr_processed = counter_u64_alloc(M_WAITOK); pmc_stats.pm_intr_bufferfull = counter_u64_alloc(M_WAITOK); pmc_stats.pm_syscalls = counter_u64_alloc(M_WAITOK); pmc_stats.pm_syscall_errors = counter_u64_alloc(M_WAITOK); pmc_stats.pm_buffer_requests = counter_u64_alloc(M_WAITOK); pmc_stats.pm_buffer_requests_failed = counter_u64_alloc(M_WAITOK); pmc_stats.pm_log_sweeps = counter_u64_alloc(M_WAITOK); pmc_stats.pm_merges = counter_u64_alloc(M_WAITOK); pmc_stats.pm_overwrites = counter_u64_alloc(M_WAITOK); #ifdef HWPMC_DEBUG /* parse debug flags first */ if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr))) pmc_debugflags_parse(pmc_debugstr, pmc_debugstr+strlen(pmc_debugstr)); #endif 
PMCDBG1(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION); /* check kernel version */ if (pmc_kernel_version != PMC_VERSION) { if (pmc_kernel_version == 0) printf("hwpmc: this kernel has not been compiled with " "'options HWPMC_HOOKS'.\n"); else printf("hwpmc: kernel version (0x%x) does not match " "module version (0x%x).\n", pmc_kernel_version, PMC_VERSION); return EPROGMISMATCH; } /* * check sysctl parameters */ if (pmc_hashsize <= 0) { (void) printf("hwpmc: tunable \"hashsize\"=%d must be " "greater than zero.\n", pmc_hashsize); pmc_hashsize = PMC_HASH_SIZE; } if (pmc_nsamples <= 0 || pmc_nsamples > 65535) { (void) printf("hwpmc: tunable \"nsamples\"=%d out of " "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } if (pmc_callchaindepth <= 0 || pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { (void) printf("hwpmc: tunable \"callchaindepth\"=%d out of " "range - using %d.\n", pmc_callchaindepth, PMC_CALLCHAIN_DEPTH_MAX); pmc_callchaindepth = PMC_CALLCHAIN_DEPTH_MAX; } md = pmc_md_initialize(); if (md == NULL) { /* Default to generic CPU. */ md = pmc_generic_cpu_initialize(); if (md == NULL) return (ENOSYS); } KASSERT(md->pmd_nclass >= 1 && md->pmd_npmc >= 1, ("[pmc,%d] no classes or pmcs", __LINE__)); /* Compute the map from row-indices to classdep pointers. 
*/ pmc_rowindex_to_classdep = malloc(sizeof(struct pmc_classdep *) * md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO); for (n = 0; n < md->pmd_npmc; n++) pmc_rowindex_to_classdep[n] = NULL; for (ri = c = 0; c < md->pmd_nclass; c++) { pcd = &md->pmd_classdep[c]; for (n = 0; n < pcd->pcd_num; n++, ri++) pmc_rowindex_to_classdep[ri] = pcd; } KASSERT(ri == md->pmd_npmc, ("[pmc,%d] npmc miscomputed: ri=%d, md->npmc=%d", __LINE__, ri, md->pmd_npmc)); maxcpu = pmc_cpu_max(); /* allocate space for the per-cpu array */ pmc_pcpu = malloc(maxcpu * sizeof(struct pmc_cpu *), M_PMC, M_WAITOK|M_ZERO); /* per-cpu 'saved values' for managing process-mode PMCs */ pmc_pcpu_saved = malloc(sizeof(pmc_value_t) * maxcpu * md->pmd_npmc, M_PMC, M_WAITOK); /* Perform CPU-dependent initialization. */ pmc_save_cpu_binding(&pb); error = 0; for (cpu = 0; error == 0 && cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; pmc_select_cpu(cpu); pmc_pcpu[cpu] = malloc(sizeof(struct pmc_cpu) + md->pmd_npmc * sizeof(struct pmc_hw *), M_PMC, M_WAITOK|M_ZERO); if (md->pmd_pcpu_init) error = md->pmd_pcpu_init(md, cpu); for (n = 0; error == 0 && n < md->pmd_nclass; n++) error = md->pmd_classdep[n].pcd_pcpu_init(md, cpu); } pmc_restore_cpu_binding(&pb); if (error) return (error); /* allocate space for the sample array */ for (cpu = 0; cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; pc = pcpu_find(cpu); domain = pc->pc_domain; sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); sb->ps_read = sb->ps_write = sb->ps_samples; sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); sb->ps_callchains = malloc_domain(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, domain, M_WAITOK|M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_HR] = 
sb; sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); sb->ps_read = sb->ps_write = sb->ps_samples; sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); sb->ps_callchains = malloc_domain(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, domain, M_WAITOK|M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_SR] = sb; sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); sb->ps_read = sb->ps_write = sb->ps_samples; sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); sb->ps_callchains = malloc_domain(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, domain, M_WAITOK|M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_UR] = sb; } /* allocate space for the row disposition array */ pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO); /* mark all PMCs as available */ for (n = 0; n < (int) md->pmd_npmc; n++) PMC_MARK_ROW_FREE(n); /* allocate thread hash tables */ pmc_ownerhash = hashinit(pmc_hashsize, M_PMC, &pmc_ownerhashmask); pmc_processhash = hashinit(pmc_hashsize, M_PMC, &pmc_processhashmask); mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf", MTX_SPIN); CK_LIST_INIT(&pmc_ss_owners); pmc_ss_count = 0; /* allocate a pool of spin mutexes */ pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size, MTX_SPIN); PMCDBG4(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx " "targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask, pmc_processhash, pmc_processhashmask); /* Initialize a spin mutex for the thread 
free list. */ mtx_init(&pmc_threadfreelist_mtx, "pmc-threadfreelist", "pmc-leaf", MTX_SPIN); /* * Initialize the callout to monitor the thread free list. * This callout will also handle the initial population of the list. */ taskqgroup_config_gtask_init(NULL, &free_gtask, pmc_thread_descriptor_pool_free_task, "thread descriptor pool free task"); /* register process {exit,fork,exec} handlers */ pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit, pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY); pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork, pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY); /* register kld event handlers */ pmc_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, pmc_kld_load, NULL, EVENTHANDLER_PRI_ANY); pmc_kld_unload_tag = EVENTHANDLER_REGISTER(kld_unload, pmc_kld_unload, NULL, EVENTHANDLER_PRI_ANY); /* initialize logging */ pmclog_initialize(); /* set hook functions */ pmc_intr = md->pmd_intr; wmb(); pmc_hook = pmc_hook_handler; if (error == 0) { printf(PMC_MODULE_NAME ":"); for (n = 0; n < (int) md->pmd_nclass; n++) { pcd = &md->pmd_classdep[n]; printf(" %s/%d/%d/0x%b", pmc_name_of_pmcclass(pcd->pcd_class), pcd->pcd_num, pcd->pcd_width, pcd->pcd_caps, "\20" "\1INT\2USR\3SYS\4EDG\5THR" "\6REA\7WRI\10INV\11QUA\12PRC" "\13TAG\14CSC"); } printf("\n"); } return (error); } /* prepare to be unloaded */ static void pmc_cleanup(void) { int c, cpu; unsigned int maxcpu; struct pmc_ownerhash *ph; struct pmc_owner *po, *tmp; struct pmc_binding pb; #ifdef HWPMC_DEBUG struct pmc_processhash *prh; #endif PMCDBG0(MOD,INI,0, "cleanup"); /* switch off sampling */ CPU_FOREACH(cpu) DPCPU_ID_SET(cpu, pmc_sampled, 0); pmc_intr = NULL; sx_xlock(&pmc_sx); if (pmc_hook == NULL) { /* being unloaded already */ sx_xunlock(&pmc_sx); return; } pmc_hook = NULL; /* prevent new threads from entering module */ /* deregister event handlers */ EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag); EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag); EVENTHANDLER_DEREGISTER(kld_load, 
pmc_kld_load_tag); EVENTHANDLER_DEREGISTER(kld_unload, pmc_kld_unload_tag); /* send SIGBUS to all owner threads, free up allocations */ if (pmc_ownerhash) for (ph = pmc_ownerhash; ph <= &pmc_ownerhash[pmc_ownerhashmask]; ph++) { LIST_FOREACH_SAFE(po, ph, po_next, tmp) { pmc_remove_owner(po); /* send SIGBUS to owner processes */ PMCDBG3(MOD,INI,2, "cleanup signal proc=%p " "(%d, %s)", po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); PROC_LOCK(po->po_owner); kern_psignal(po->po_owner, SIGBUS); PROC_UNLOCK(po->po_owner); pmc_destroy_owner_descriptor(po); } } /* reclaim allocated data structures */ mtx_destroy(&pmc_threadfreelist_mtx); pmc_thread_descriptor_pool_drain(); if (pmc_mtxpool) mtx_pool_destroy(&pmc_mtxpool); mtx_destroy(&pmc_processhash_mtx); taskqgroup_config_gtask_deinit(&free_gtask); if (pmc_processhash) { #ifdef HWPMC_DEBUG struct pmc_process *pp; PMCDBG0(MOD,INI,3, "destroy process hash"); for (prh = pmc_processhash; prh <= &pmc_processhash[pmc_processhashmask]; prh++) LIST_FOREACH(pp, prh, pp_next) PMCDBG1(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid); #endif hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask); pmc_processhash = NULL; } if (pmc_ownerhash) { PMCDBG0(MOD,INI,3, "destroy owner hash"); hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask); pmc_ownerhash = NULL; } KASSERT(CK_LIST_EMPTY(&pmc_ss_owners), ("[pmc,%d] Global SS owner list not empty", __LINE__)); KASSERT(pmc_ss_count == 0, ("[pmc,%d] Global SS count not empty", __LINE__)); /* do processor and pmc-class dependent cleanup */ maxcpu = pmc_cpu_max(); PMCDBG0(MOD,INI,3, "md cleanup"); if (md) { pmc_save_cpu_binding(&pb); for (cpu = 0; cpu < maxcpu; cpu++) { PMCDBG2(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p", cpu, pmc_pcpu[cpu]); if (!pmc_cpu_is_active(cpu) || pmc_pcpu[cpu] == NULL) continue; pmc_select_cpu(cpu); for (c = 0; c < md->pmd_nclass; c++) md->pmd_classdep[c].pcd_pcpu_fini(md, cpu); if (md->pmd_pcpu_fini) md->pmd_pcpu_fini(md, cpu); } if (md->pmd_cputype == 
PMC_CPU_GENERIC) pmc_generic_cpu_finalize(md); else pmc_md_finalize(md); pmc_mdep_free(md); md = NULL; pmc_restore_cpu_binding(&pb); } /* Free per-cpu descriptors. */ for (cpu = 0; cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_HR] != NULL, ("[pmc,%d] Null hw cpu sample buffer cpu=%d", __LINE__, cpu)); KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_SR] != NULL, ("[pmc,%d] Null sw cpu sample buffer cpu=%d", __LINE__, cpu)); KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_UR] != NULL, ("[pmc,%d] Null userret cpu sample buffer cpu=%d", __LINE__, cpu)); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_HR]->ps_callchains, M_PMC); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_HR], M_PMC); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_SR]->ps_callchains, M_PMC); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_SR], M_PMC); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_UR]->ps_callchains, M_PMC); free_domain(pmc_pcpu[cpu]->pc_sb[PMC_UR], M_PMC); free_domain(pmc_pcpu[cpu], M_PMC); } free(pmc_pcpu, M_PMC); pmc_pcpu = NULL; free(pmc_pcpu_saved, M_PMC); pmc_pcpu_saved = NULL; if (pmc_pmcdisp) { free(pmc_pmcdisp, M_PMC); pmc_pmcdisp = NULL; } if (pmc_rowindex_to_classdep) { free(pmc_rowindex_to_classdep, M_PMC); pmc_rowindex_to_classdep = NULL; } pmclog_shutdown(); counter_u64_free(pmc_stats.pm_intr_ignored); counter_u64_free(pmc_stats.pm_intr_processed); counter_u64_free(pmc_stats.pm_intr_bufferfull); counter_u64_free(pmc_stats.pm_syscalls); counter_u64_free(pmc_stats.pm_syscall_errors); counter_u64_free(pmc_stats.pm_buffer_requests); counter_u64_free(pmc_stats.pm_buffer_requests_failed); counter_u64_free(pmc_stats.pm_log_sweeps); counter_u64_free(pmc_stats.pm_merges); counter_u64_free(pmc_stats.pm_overwrites); sx_xunlock(&pmc_sx); /* we are done */ } /* * The function called at load/unload. 
*/ static int load (struct module *module __unused, int cmd, void *arg __unused) { int error; error = 0; switch (cmd) { case MOD_LOAD : /* initialize the subsystem */ error = pmc_initialize(); if (error != 0) break; PMCDBG2(MOD,INI,1, "syscall=%d maxcpu=%d", pmc_syscall_num, pmc_cpu_max()); break; case MOD_UNLOAD : case MOD_SHUTDOWN: pmc_cleanup(); PMCDBG0(MOD,INI,1, "unloaded"); break; default : error = EINVAL; /* XXX should panic(9) */ break; } return error; } Index: head/sys/kern/subr_epoch.c =================================================================== --- head/sys/kern/subr_epoch.c (revision 335923) +++ head/sys/kern/subr_epoch.c (revision 335924) @@ -1,638 +1,587 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation"); /* arbitrary --- needs benchmarking */ #define MAX_ADAPTIVE_SPIN 1000 #define MAX_EPOCHS 64 -#ifdef __amd64__ -#define EPOCH_ALIGN CACHE_LINE_SIZE*2 -#else -#define EPOCH_ALIGN CACHE_LINE_SIZE -#endif - -CTASSERT(sizeof(epoch_section_t) == sizeof(ck_epoch_section_t)); CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context)); SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information"); SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats"); - /* Stats. 
*/ static counter_u64_t block_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW, &block_count, "# of times a thread was in an epoch when epoch_wait was called"); static counter_u64_t migrate_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW, &migrate_count, "# of times thread was migrated to another CPU in epoch_wait"); static counter_u64_t turnstile_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW, &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait"); static counter_u64_t switch_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW, &switch_count, "# of times a thread voluntarily context switched in epoch_wait"); static counter_u64_t epoch_call_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW, &epoch_call_count, "# of times a callback was deferred"); static counter_u64_t epoch_call_task_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW, &epoch_call_task_count, "# of times a callback task was run"); TAILQ_HEAD (threadlist, thread); CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container) -typedef struct epoch_record { - ck_epoch_record_t er_record; - volatile struct threadlist er_tdlist; - volatile uint32_t er_gen; - uint32_t er_cpuid; -} *epoch_record_t; -struct epoch_pcpu_state { - struct epoch_record eps_record; -} __aligned(EPOCH_ALIGN); + epoch_t allepochs[MAX_EPOCHS]; -struct epoch { - struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); - struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM] __aligned(EPOCH_ALIGN); - int e_idx; - int e_flags; - struct epoch_pcpu_state *e_pcpu[0]; -}; - -epoch_t allepochs[MAX_EPOCHS]; - DPCPU_DEFINE(struct grouptask, epoch_cb_task); DPCPU_DEFINE(int, epoch_cb_count); static __read_mostly int domcount[MAXMEMDOM]; static __read_mostly int domoffsets[MAXMEMDOM]; static __read_mostly int inited; static 
__read_mostly int epoch_count; __read_mostly epoch_t global_epoch; __read_mostly epoch_t global_epoch_preempt; static void epoch_call_task(void *context __unused); #if defined(__powerpc64__) || defined(__powerpc__) || !defined(NUMA) static bool usedomains = false; #else static bool usedomains = true; #endif static void epoch_init(void *arg __unused) { int domain, cpu; block_count = counter_u64_alloc(M_WAITOK); migrate_count = counter_u64_alloc(M_WAITOK); turnstile_count = counter_u64_alloc(M_WAITOK); switch_count = counter_u64_alloc(M_WAITOK); epoch_call_count = counter_u64_alloc(M_WAITOK); epoch_call_task_count = counter_u64_alloc(M_WAITOK); if (usedomains == false) goto done; domain = 0; domoffsets[0] = 0; for (domain = 0; domain < vm_ndomains; domain++) { domcount[domain] = CPU_COUNT(&cpuset_domain[domain]); if (bootverbose) printf("domcount[%d] %d\n", domain, domcount[domain]); } for (domain = 1; domain < vm_ndomains; domain++) domoffsets[domain] = domoffsets[domain - 1] + domcount[domain - 1]; for (domain = 0; domain < vm_ndomains; domain++) { if (domcount[domain] == 0) { usedomains = false; break; } } done: CPU_FOREACH(cpu) { GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0, epoch_call_task, NULL); taskqgroup_attach_cpu(qgroup_softirq, DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, -1, "epoch call task"); } inited = 1; global_epoch = epoch_alloc(0); global_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); } SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL); #if !defined(EARLY_AP_STARTUP) static void epoch_init_smp(void *dummy __unused) { inited = 2; } SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL); #endif static void epoch_init_numa(epoch_t epoch) { int domain, cpu_offset; - struct epoch_pcpu_state *eps; epoch_record_t er; for (domain = 0; domain < vm_ndomains; domain++) { - eps = malloc_domain(sizeof(*eps) * domcount[domain], M_EPOCH, + er = malloc_domain(sizeof(*er) * domcount[domain], M_EPOCH, domain, M_ZERO | 
M_WAITOK); - epoch->e_pcpu_dom[domain] = eps; + epoch->e_pcpu_dom[domain] = er; cpu_offset = domoffsets[domain]; - for (int i = 0; i < domcount[domain]; i++, eps++) { - epoch->e_pcpu[cpu_offset + i] = eps; - er = &eps->eps_record; + for (int i = 0; i < domcount[domain]; i++, er++) { + epoch->e_pcpu[cpu_offset + i] = er; ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = cpu_offset + i; } } } static void epoch_init_legacy(epoch_t epoch) { - struct epoch_pcpu_state *eps; epoch_record_t er; - eps = malloc(sizeof(*eps) * mp_ncpus, M_EPOCH, M_ZERO | M_WAITOK); - epoch->e_pcpu_dom[0] = eps; - for (int i = 0; i < mp_ncpus; i++, eps++) { - epoch->e_pcpu[i] = eps; - er = &eps->eps_record; + er = malloc(sizeof(*er) * mp_ncpus, M_EPOCH, M_ZERO | M_WAITOK); + epoch->e_pcpu_dom[0] = er; + for (int i = 0; i < mp_ncpus; i++, er++) { + epoch->e_pcpu[i] = er; ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = i; } } epoch_t epoch_alloc(int flags) { epoch_t epoch; if (__predict_false(!inited)) panic("%s called too early in boot", __func__); epoch = malloc(sizeof(struct epoch) + mp_ncpus * sizeof(void *), M_EPOCH, M_ZERO | M_WAITOK); ck_epoch_init(&epoch->e_epoch); if (usedomains) epoch_init_numa(epoch); else epoch_init_legacy(epoch); MPASS(epoch_count < MAX_EPOCHS - 2); epoch->e_flags = flags; epoch->e_idx = epoch_count; allepochs[epoch_count++] = epoch; return (epoch); } void epoch_free(epoch_t epoch) { int domain; #ifdef INVARIANTS - struct epoch_pcpu_state *eps; + struct epoch_record *er; int cpu; CPU_FOREACH(cpu) { - eps = epoch->e_pcpu[cpu]; - MPASS(TAILQ_EMPTY(&eps->eps_record.er_tdlist)); + er = epoch->e_pcpu[cpu]; + MPASS(TAILQ_EMPTY(&er->er_tdlist)); } #endif allepochs[epoch->e_idx] = NULL; epoch_wait(global_epoch); if (usedomains) for (domain = 0; domain < vm_ndomains; domain++) 
free_domain(epoch->e_pcpu_dom[domain], M_EPOCH); else free(epoch->e_pcpu_dom[0], M_EPOCH); free(epoch, M_EPOCH); } -#define INIT_CHECK(epoch) \ - do { \ - if (__predict_false((epoch) == NULL)) \ - return; \ - } while (0) - void -epoch_enter_preempt_internal(epoch_t epoch, struct thread *td) +epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et) { - struct epoch_pcpu_state *eps; - MPASS(cold || epoch != NULL); - INIT_CHECK(epoch); - MPASS(epoch->e_flags & EPOCH_PREEMPT); - critical_enter(); - td->td_pre_epoch_prio = td->td_priority; - eps = epoch->e_pcpu[curcpu]; -#ifdef INVARIANTS - MPASS(td->td_epochnest < UCHAR_MAX - 2); - if (td->td_epochnest > 1) { - struct thread *curtd; - int found = 0; - - TAILQ_FOREACH(curtd, &eps->eps_record.er_tdlist, td_epochq) - if (curtd == td) - found = 1; - KASSERT(found, ("recursing on a second epoch")); - critical_exit(); - return; - } -#endif - TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq); - sched_pin(); - ck_epoch_begin(&eps->eps_record.er_record, (ck_epoch_section_t *)&td->td_epoch_section); - critical_exit(); + epoch_enter_preempt(epoch, et); } - void -epoch_enter(epoch_t epoch) +epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et) { - ck_epoch_record_t *record; - struct thread *td; - MPASS(cold || epoch != NULL); - INIT_CHECK(epoch); - td = curthread; - - critical_enter(); - td->td_epochnest++; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; - ck_epoch_begin(record, NULL); + epoch_exit_preempt(epoch, et); } void -epoch_exit_preempt_internal(epoch_t epoch, struct thread *td) +epoch_enter_KBI(epoch_t epoch) { - struct epoch_pcpu_state *eps; - MPASS(td->td_epochnest == 0); - INIT_CHECK(epoch); - critical_enter(); - eps = epoch->e_pcpu[curcpu]; - - MPASS(epoch->e_flags & EPOCH_PREEMPT); - ck_epoch_end(&eps->eps_record.er_record, (ck_epoch_section_t *)&td->td_epoch_section); - TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq); - eps->eps_record.er_gen++; - sched_unpin(); - if 
(__predict_false(td->td_pre_epoch_prio != td->td_priority)) { - thread_lock(td); - sched_prio(td, td->td_pre_epoch_prio); - thread_unlock(td); - } - critical_exit(); + epoch_enter(epoch); } void -epoch_exit(epoch_t epoch) +epoch_exit_KBI(epoch_t epoch) { - ck_epoch_record_t *record; - struct thread *td; - INIT_CHECK(epoch); - td = curthread; - td->td_epochnest--; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; - ck_epoch_end(record, NULL); - critical_exit(); + epoch_exit(epoch); } /* * epoch_block_handler_preempt is a callback from the ck code when another thread is * currently in an epoch section. */ static void epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t *cr, void *arg __unused) { epoch_record_t record; - struct thread *td, *tdwait, *owner; + struct thread *td, *owner, *curwaittd; + struct epoch_thread *tdwait; struct turnstile *ts; struct lock_object *lock; int spincount, gen; int locksheld __unused; record = __containerof(cr, struct epoch_record, er_record); td = curthread; locksheld = td->td_locks; spincount = 0; counter_u64_add(block_count, 1); if (record->er_cpuid != curcpu) { /* * If the head of the list is running, we can wait for it * to remove itself from the list and thus save us the * overhead of a migration */ if ((tdwait = TAILQ_FIRST(&record->er_tdlist)) != NULL && - TD_IS_RUNNING(tdwait)) { + TD_IS_RUNNING(tdwait->et_td)) { gen = record->er_gen; thread_unlock(td); do { cpu_spinwait(); } while (tdwait == TAILQ_FIRST(&record->er_tdlist) && - gen == record->er_gen && TD_IS_RUNNING(tdwait) && + gen == record->er_gen && TD_IS_RUNNING(tdwait->et_td) && spincount++ < MAX_ADAPTIVE_SPIN); thread_lock(td); return; } /* * Being on the same CPU as that of the record on which * we need to wait allows us access to the thread * list associated with that CPU. We can then examine the * oldest thread in the queue and wait on its turnstile * until it resumes and so on until a grace period * elapses. 
* */ counter_u64_add(migrate_count, 1); sched_bind(td, record->er_cpuid); /* * At this point we need to return to the ck code * to scan to see if a grace period has elapsed. * We can't move on to check the thread list, because * in the meantime new threads may have arrived that * in fact belong to a different epoch. */ return; } /* * Try to find a thread in an epoch section on this CPU * waiting on a turnstile. Otherwise find the lowest * priority thread (highest prio value) and drop our priority * to match to allow it to run. */ - TAILQ_FOREACH(tdwait, &record->er_tdlist, td_epochq) { + TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) { /* * Propagate our priority to any other waiters to prevent us * from starving them. They will have their original priority * restore on exit from epoch_wait(). */ - if (!TD_IS_INHIBITED(tdwait) && tdwait->td_priority > td->td_priority) { + curwaittd = tdwait->et_td; + if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) { critical_enter(); thread_unlock(td); - thread_lock(tdwait); - sched_prio(tdwait, td->td_priority); - thread_unlock(tdwait); + thread_lock(curwaittd); + sched_prio(curwaittd, td->td_priority); + thread_unlock(curwaittd); thread_lock(td); critical_exit(); } - if (TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait) && - ((ts = tdwait->td_blocked) != NULL)) { + if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) && + ((ts = curwaittd->td_blocked) != NULL)) { /* * We unlock td to allow turnstile_wait to reacquire the * the thread lock. Before unlocking it we enter a critical * section to prevent preemption after we reenable interrupts - * by dropping the thread lock in order to prevent tdwait + * by dropping the thread lock in order to prevent curwaittd * from getting to run. */ critical_enter(); thread_unlock(td); owner = turnstile_lock(ts, &lock); /* * The owner pointer indicates that the lock succeeded. 
Only * in case we hold the lock and the turnstile we locked is still - * the one that tdwait is blocked on can we continue. Otherwise + * the one that curwaittd is blocked on can we continue. Otherwise * The turnstile pointer has been changed out from underneath - * us, as in the case where the lock holder has signalled tdwait, + * us, as in the case where the lock holder has signalled curwaittd, * and we need to continue. */ - if (owner != NULL && ts == tdwait->td_blocked) { - MPASS(TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait)); + if (owner != NULL && ts == curwaittd->td_blocked) { + MPASS(TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd)); critical_exit(); - turnstile_wait(ts, owner, tdwait->td_tsqueue); + turnstile_wait(ts, owner, curwaittd->td_tsqueue); counter_u64_add(turnstile_count, 1); thread_lock(td); return; } else if (owner != NULL) turnstile_unlock(ts, lock); thread_lock(td); critical_exit(); KASSERT(td->td_locks == locksheld, ("%d extra locks held", td->td_locks - locksheld)); } } /* * We didn't find any threads actually blocked on a lock * so we have nothing to do except context switch away. */ counter_u64_add(switch_count, 1); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); /* * Release the thread lock while yielding to * allow other threads to acquire the lock * pointed to by TDQ_LOCKPTR(td). Else a * deadlock like situation might happen. 
(HPS) */ thread_unlock(td); thread_lock(td); } void epoch_wait_preempt(epoch_t epoch) { struct thread *td; int was_bound; int old_cpu; int old_pinned; u_char old_prio; int locks __unused; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); td = curthread; #ifdef INVARIANTS locks = curthread->td_locks; MPASS(epoch->e_flags & EPOCH_PREEMPT); if ((epoch->e_flags & EPOCH_LOCKED) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "epoch_wait() can be long running"); KASSERT(td->td_epochnest == 0, ("epoch_wait() in the middle of an epoch section")); #endif thread_lock(td); DROP_GIANT(); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; old_prio = td->td_priority; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; sched_bind(td, old_cpu); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, NULL); /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; /* restore thread priority */ sched_prio(td, old_prio); thread_unlock(td); PICKUP_GIANT(); KASSERT(td->td_locks == locks, ("%d residual locks held", td->td_locks - locks)); } static void epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused, void *arg __unused) { cpu_spinwait(); } void epoch_wait(epoch_t epoch) { MPASS(cold || epoch != NULL); INIT_CHECK(epoch); MPASS(epoch->e_flags == 0); critical_enter(); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL); critical_exit(); } void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)) { - struct epoch_pcpu_state *eps; + epoch_record_t er; ck_epoch_entry_t *cb; cb = (void *)ctx; MPASS(callback); /* too early in boot to have epoch set up */ if (__predict_false(epoch == NULL)) goto boottime; #if !defined(EARLY_AP_STARTUP) if (__predict_false(inited < 2)) goto 
boottime; #endif critical_enter(); *DPCPU_PTR(epoch_cb_count) += 1; - eps = epoch->e_pcpu[curcpu]; - ck_epoch_call(&eps->eps_record.er_record, cb, (ck_epoch_cb_t *)callback); + er = epoch->e_pcpu[curcpu]; + ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback); critical_exit(); return; boottime: callback(ctx); } static void epoch_call_task(void *arg __unused) { ck_stack_entry_t *cursor, *head, *next; ck_epoch_record_t *record; epoch_t epoch; ck_stack_t cb_stack; int i, npending, total; ck_stack_init(&cb_stack); critical_enter(); epoch_enter(global_epoch); for (total = i = 0; i < epoch_count; i++) { if (__predict_false((epoch = allepochs[i]) == NULL)) continue; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; + record = &epoch->e_pcpu[curcpu]->er_record; if ((npending = record->n_pending) == 0) continue; ck_epoch_poll_deferred(record, &cb_stack); total += npending - record->n_pending; } epoch_exit(global_epoch); *DPCPU_PTR(epoch_cb_count) -= total; critical_exit(); counter_u64_add(epoch_call_count, total); counter_u64_add(epoch_call_task_count, 1); head = ck_stack_batch_pop_npsc(&cb_stack); for (cursor = head; cursor != NULL; cursor = next) { struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor); next = CK_STACK_NEXT(cursor); entry->function(entry); } } int -in_epoch(void) +in_epoch_verbose(epoch_t epoch, int dump_onfail) { - return (curthread->td_epochnest != 0); + struct epoch_thread *tdwait; + struct thread *td; + epoch_record_t er; + + td = curthread; + if (td->td_epochnest == 0) + return (0); + if (__predict_false((epoch) == NULL)) + return (0); + critical_enter(); + er = epoch->e_pcpu[curcpu]; + TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) + if (tdwait->et_td == td) { + critical_exit(); + return (1); + } +#ifdef INVARIANTS + if (dump_onfail) { + MPASS(td->td_pinned); + printf("cpu: %d id: %d\n", curcpu, td->td_tid); + TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) + printf("td_tid: %d ", tdwait->et_td->td_tid); + printf("\n"); + } 
+#endif + critical_exit(); + return (0); +} + +int +in_epoch(epoch_t epoch) +{ + return (in_epoch_verbose(epoch, 0)); +} + +void +epoch_adjust_prio(struct thread *td, u_char prio) +{ + thread_lock(td); + sched_prio(td, prio); + thread_unlock(td); } Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 335923) +++ head/sys/net/if.c (revision 335924) @@ -1,4535 +1,4541 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; __read_mostly epoch_t net_epoch; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ case _IOC_NEWTYPE((cmd), struct ifgroupreq32): #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ case (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, 
CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. 
 */
/* Forward declarations for the file-local helpers defined further down. */
static void	if_attachdomain(void *);
static void	if_attachdomain1(struct ifnet *);
static int	ifconf(u_long, caddr_t);
static void	*if_grow(void);
static void	if_input_default(struct ifnet *, struct mbuf *);
static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
static void	if_route(struct ifnet *, int flag, int fam);
static int	if_setflag(struct ifnet *, int, int, int *, int);
static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
static void	if_unroute(struct ifnet *, int flag, int fam);
static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int	ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void	do_link_state_change(void *, int);
static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
static int	if_getgroupmembers(struct ifgroupreq *);
static void	if_delgroups(struct ifnet *);
static void	if_attach_internal(struct ifnet *, int, struct if_clone *);
static int	if_detach_internal(struct ifnet *, int, struct if_clone **);
#ifdef VIMAGE
static void	if_vmove(struct ifnet *, struct vnet *);
#endif

#ifdef INET6
/*
 * XXX: declared here to avoid including many inet6-related files;
 * this should be generalized.
 */
extern void	nd6_setmtu(struct ifnet *);
#endif

/* ipsec helper hooks */
VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);

/* Highest interface index currently in use (see ifindex_alloc/free). */
VNET_DEFINE(int, if_index);
/* Default send queue length; ifq_init() applies it when ifq_maxlen is 0. */
int	ifqmaxlen = IFQ_MAXLEN;
VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);

/* Capacity bookkeeping for the index table; if_grow() doubles it. */
static VNET_DEFINE(int, if_indexlim) = 8;

/* Table of ifnet by index.
 */
VNET_DEFINE(struct ifnet **, ifindex_table);

/* Shorthand accessors for the per-vnet index limit and index table. */
#define	V_if_indexlim		VNET(if_indexlim)
#define	V_ifindex_table		VNET(ifindex_table)

/*
 * The global network interface list (V_ifnet) and related state (such as
 * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
 * an rwlock.  Either may be acquired shared to stabilize the list, but both
 * must be acquired writable to modify the list.  This model allows us to
 * both stabilize the interface list during interrupt thread processing, but
 * also to stabilize it over long-running ioctls, without introducing priority
 * inversions and deadlocks.
 */
struct rwlock ifnet_rwlock;
RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE);
struct sx ifnet_sxlock;
SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);

/*
 * The allocation of network interfaces is a rather non-atomic affair; we
 * need to select an index before we are ready to expose the interface for
 * use, so will use this pointer value to indicate reservation.
 */
#define	IFNET_HOLD	(void *)(uintptr_t)(-1)

/* Per-interface-type layer 2 allocate/free hooks, indexed by the u_char type. */
static	if_com_alloc_t *if_com_alloc[256];
static	if_com_free_t *if_com_free[256];

static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");

/*
 * Look up an interface by index.  Returns NULL when idx is above
 * V_if_index or when the slot only holds the IFNET_HOLD reservation
 * marker; otherwise returns the slot contents, which may itself be NULL
 * for an unused slot.
 *
 * NOTE(review): the "_locked" suffix suggests the caller is expected to
 * hold the ifnet lock, but nothing in this function asserts it.
 */
struct ifnet *
ifnet_byindex_locked(u_short idx)
{

	if (idx > V_if_index)
		return (NULL);
	if (V_ifindex_table[idx] == IFNET_HOLD)
		return (NULL);
	return (V_ifindex_table[idx]);
}

/*
 * Look up an interface by index.  This is a plain wrapper around
 * ifnet_byindex_locked(); it takes no lock and no reference itself.
 */
struct ifnet *
ifnet_byindex(u_short idx)
{
	struct ifnet *ifp;

	ifp = ifnet_byindex_locked(idx);
	return (ifp);
}

/*
 * Look up an interface by index under the ifnet read lock and return it
 * with an extra reference taken via if_ref().  Returns NULL when there is
 * no such interface or when it is already marked IFF_DYING.
 */
struct ifnet *
ifnet_byindex_ref(u_short idx)
{
	struct ifnet *ifp;

	IFNET_RLOCK_NOSLEEP();
	ifp = ifnet_byindex_locked(idx);
	if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
		IFNET_RUNLOCK_NOSLEEP();
		return (NULL);
	}
	if_ref(ifp);
	IFNET_RUNLOCK_NOSLEEP();
	return (ifp);
}

/*
 * Allocate an ifindex array entry and return the chosen index.  When the
 * table has no room left, grow it with if_grow(), store if_grow()'s return
 * value in *old and return USHRT_MAX; the caller is expected to dispose of
 * *old and retry.
*/ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. 
 */

/*
 * Per-vnet initialization: set up the interface and interface-group
 * lists, create the initial index table and initialize interface cloning.
 */
static void
vnet_if_init(const void *unused __unused)
{
	void *old;

	CK_STAILQ_INIT(&V_ifnet);
	CK_STAILQ_INIT(&V_ifg_head);
	IFNET_WLOCK();
	old = if_grow();				/* create initial table */
	IFNET_WUNLOCK();
	/*
	 * Wait on net_epoch_preempt before freeing whatever table if_grow()
	 * handed back (NULL when there was no previous table).
	 */
	epoch_wait_preempt(net_epoch_preempt);
	free(old, M_IFNET);
	vnet_if_clone_init();
}
VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
    NULL);

#ifdef VIMAGE
/*
 * Per-vnet teardown: assert that no interfaces or groups remain, then
 * release the index table.
 */
static void
vnet_if_uninit(const void *unused __unused)
{

	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
	    "not empty", __func__, __LINE__, &V_ifnet));
	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
	    "not empty", __func__, __LINE__, &V_ifg_head));

	free((caddr_t)V_ifindex_table, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
    vnet_if_uninit, NULL);

/*
 * Hand every interface whose home vnet differs from its current vnet back
 * to its home vnet.
 */
static void
vnet_if_return(const void *unused __unused)
{
	struct ifnet *ifp, *nifp;

	/* Return all inherited interfaces to their parent vnets. */
	/* The _SAFE iterator is used because if_vmove() is called on ifp. */
	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
		if (ifp->if_home_vnet != ifp->if_vnet)
			if_vmove(ifp, ifp->if_home_vnet);
	}
}
VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
    vnet_if_return, NULL);
#endif

/*
 * Double the size of the interface index table.
 *
 * Called with the ifnet write lock held and returns with it held, but the
 * lock is dropped around the allocation.  Returns the previous table so
 * the caller can free it (both callers in this file first call
 * epoch_wait_preempt()), or NULL when there was no previous table or when
 * the limit changed while the lock was dropped and nothing was done.
 */
static void *
if_grow(void)
{
	int oldlim;
	u_int n;
	struct ifnet **e;
	void *old;

	old = NULL;
	IFNET_WLOCK_ASSERT();
	oldlim = V_if_indexlim;
	/*
	 * Drop the lock across the M_WAITOK allocation.
	 * NOTE(review): presumably because the allocation may sleep.
	 */
	IFNET_WUNLOCK();
	n = (oldlim << 1) * sizeof(*e);
	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
	IFNET_WLOCK();
	/* The limit moved while we were unlocked; discard our allocation. */
	if (V_if_indexlim != oldlim) {
		free(e, M_IFNET);
		return (NULL);
	}
	if (V_ifindex_table != NULL) {
		/* n/2 bytes is the size of the table being replaced. */
		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
		old = V_ifindex_table;
	}
	V_if_indexlim <<= 1;
	V_ifindex_table = e;
	return (old);
}

/*
 * Allocate a struct ifnet and an index for an interface.  A layer 2
 * common structure will also be allocated if an allocation routine is
 * registered for the passed type.
 *
 * Returns the new ifnet, or NULL when the registered layer 2 allocation
 * routine fails.
 */
struct ifnet *
if_alloc(u_char type)
{
	struct ifnet *ifp;
	u_short idx;
	void *old;

	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
 restart:
	IFNET_WLOCK();
	idx = ifindex_alloc(&old);
	if (__predict_false(idx == USHRT_MAX)) {
		/*
		 * The index table was full and ifindex_alloc() grew it.
		 * Wait on net_epoch_preempt before freeing the table it
		 * handed back, then try again.
		 */
		IFNET_WUNLOCK();
		epoch_wait_preempt(net_epoch_preempt);
		free(old, M_IFNET);
		goto restart;
	}
	/* Reserve the slot; the real pointer is published at the end. */
	ifnet_setbyindex(idx, IFNET_HOLD);
	IFNET_WUNLOCK();
	ifp->if_index = idx;
	ifp->if_type = type;
	ifp->if_alloctype = type;
#ifdef VIMAGE
	ifp->if_vnet = curvnet;
#endif
	if (if_com_alloc[type] != NULL) {
		ifp->if_l2com = if_com_alloc[type](type, ifp);
		if (ifp->if_l2com == NULL) {
			/* Undo the allocation and give the index slot back. */
			free(ifp, M_IFNET);
			ifindex_free(idx);
			return (NULL);
		}
	}

	IF_ADDR_LOCK_INIT(ifp);
	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
	ifp->if_afdata_initialized = 0;
	IF_AFDATA_LOCK_INIT(ifp);
	CK_STAILQ_INIT(&ifp->if_addrhead);
	CK_STAILQ_INIT(&ifp->if_multiaddrs);
	CK_STAILQ_INIT(&ifp->if_groups);
#ifdef MAC
	mac_ifnet_init(ifp);
#endif
	ifq_init(&ifp->if_snd, ifp);

	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
	for (int i = 0; i < IFCOUNTERS; i++)
		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
	ifp->if_get_counter = if_get_counter_default;
	ifp->if_pcp = IFNET_PCP_NONE;
	/* Replace the IFNET_HOLD reservation with the initialized ifnet. */
	ifnet_setbyindex(ifp->if_index, ifp);
	return (ifp);
}

/*
 * Do the actual work of freeing a struct ifnet, and layer 2 common
 * structure.  This call is made when the last reference to an
 * interface is released.
*/ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. 
*/ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. 
*
	 * 2) If the network adapter has a limit below the current
	 *    limit, apply it.
	 */
	if (pmax->tsomaxbytes == 0 ||
	    (ifp->if_hw_tsomax != 0 &&
	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
		pmax->tsomaxbytes = ifp->if_hw_tsomax;
	}
	if (pmax->tsomaxsegcount == 0 ||
	    (ifp->if_hw_tsomaxsegcount != 0 &&
	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
	}
	if (pmax->tsomaxsegsize == 0 ||
	    (ifp->if_hw_tsomaxsegsize != 0 &&
	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
	}
}

/*
 * Update TSO limit of a network adapter.
 *
 * Copies the byte, segment-size and segment-count limits from *pmax into
 * the ifnet, touching only the fields that differ.
 *
 * Returns zero if no change.  Else non-zero (the number of fields that
 * were updated).
 */
int
if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
{
	int retval = 0;
	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
		ifp->if_hw_tsomax = pmax->tsomaxbytes;
		retval++;
	}
	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
		retval++;
	}
	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
		retval++;
	}
	return (retval);
}

/*
 * Common attach path.
 *
 * ifp   - interface to attach; must already be registered under its
 *         if_index (the function panics otherwise).
 * vmove - non-zero when the ifnet is being re-attached; the one-time setup
 *         in the !vmove branch (MAC label, link-level ifaddr, hardware
 *         address buffer, TSO defaults) is skipped.
 * ifc   - cloner whose group membership is restored when vmove is set;
 *         may be NULL.
 */
static void
if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc)
{
	unsigned socksize, ifasize;
	int namelen, masklen;
	struct sockaddr_dl *sdl;
	struct ifaddr *ifa;

	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
		    ifp->if_xname);

#ifdef VIMAGE
	ifp->if_vnet = curvnet;
	if (ifp->if_home_vnet == NULL)
		ifp->if_home_vnet = curvnet;
#endif

	if_addgroup(ifp, IFG_ALL);

	/* Restore group membership for cloned interfaces. */
	if (vmove && ifc != NULL)
		if_clone_addgroup(ifp, ifc);

	getmicrotime(&ifp->if_lastchange);
	ifp->if_epoch = time_uptime;

	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
	    ("transmit and qflush must both either be set or both be NULL"));
	/* Install the default handlers for any method the driver left unset. */
	if (ifp->if_transmit == NULL) {
		ifp->if_transmit = if_transmit;
		ifp->if_qflush = if_qflush;
	}
	if (ifp->if_input == NULL)
		ifp->if_input = if_input_default;

	if (ifp->if_requestencap == NULL)
		ifp->if_requestencap = if_requestencap_default;

	if (!vmove) {
#ifdef MAC
		mac_ifnet_create(ifp);
#endif

		/*
		 * Create a Link Level name for this device.
		 */
		namelen = strlen(ifp->if_xname);
		/*
		 * Always save enough space for any possible name so we
		 * can do a rename in place later.
		 */
		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
		socksize = masklen + ifp->if_addrlen;
		if (socksize < sizeof(*sdl))
			socksize = sizeof(*sdl);
		socksize = roundup2(socksize, sizeof(long));
		/* One allocation: the ifaddr, then the address, then the mask. */
		ifasize = sizeof(*ifa) + 2 * socksize;
		ifa = ifa_alloc(ifasize, M_WAITOK);
		sdl = (struct sockaddr_dl *)(ifa + 1);
		sdl->sdl_len = socksize;
		sdl->sdl_family = AF_LINK;
		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		sdl->sdl_index = ifp->if_index;
		sdl->sdl_type = ifp->if_type;
		ifp->if_addr = ifa;
		ifa->ifa_ifp = ifp;
		ifa->ifa_rtrequest = link_rtrequest;
		ifa->ifa_addr = (struct sockaddr *)sdl;
		/* The netmask sockaddr sits socksize bytes after the address. */
		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
		ifa->ifa_netmask = (struct sockaddr *)sdl;
		sdl->sdl_len = masklen;
		/* Mask covers exactly the name bytes. */
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
		/* Reliably crash if used uninitialized. */
		ifp->if_broadcastaddr = NULL;

		if (ifp->if_type == IFT_ETHER) {
			ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
			    M_WAITOK | M_ZERO);
		}

#if defined(INET) || defined(INET6)
		/* Use defaults for TSO, if nothing is set */
		if (ifp->if_hw_tsomax == 0 &&
		    ifp->if_hw_tsomaxsegcount == 0 &&
		    ifp->if_hw_tsomaxsegsize == 0) {
			/*
			 * The TSO defaults needs to be such that an
			 * NFS mbuf list of 35 mbufs totalling just
			 * below 64K works and that a chain of mbufs
			 * can be defragged into at most 32 segments:
			 */
			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
			ifp->if_hw_tsomaxsegcount = 35;
			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */

			/* XXX some drivers set IFCAP_TSO after ethernet attach */
			if (ifp->if_capabilities & IFCAP_TSO) {
				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
				    ifp->if_hw_tsomax,
				    ifp->if_hw_tsomaxsegcount,
				    ifp->if_hw_tsomaxsegsize);
			}
		}
#endif
	}
#ifdef VIMAGE
	else {
		/*
		 * Update the interface index in the link layer address
		 * of the interface.
		 */
		for (ifa = ifp->if_addr; ifa != NULL;
		    ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
			if (ifa->ifa_addr->sa_family == AF_LINK) {
				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
				sdl->sdl_index = ifp->if_index;
			}
		}
	}
#endif

	IFNET_WLOCK();
	CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
	curvnet->vnet_ifcnt++;
#endif
	IFNET_WUNLOCK();

	if (domain_init_status >= 2)
		if_attachdomain1(ifp);

	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
	if (IS_DEFAULT_VNET(curvnet))
		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);

	/* Announce the interface. */
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
}

/*
 * SYSINIT hook: allocate the two network epochs (preemptible and
 * non-preemptible) into net_epoch_preempt and net_epoch.
 */
static void
if_epochalloc(void *dummy __unused)
{

	net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
	net_epoch = epoch_alloc(0);
}
SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY,
    if_epochalloc, NULL);

/*
 * SYSINIT hook: run if_attachdomain1() on every interface currently on
 * the V_ifnet list.
 */
static void
if_attachdomain(void *dummy)
{
	struct ifnet *ifp;

	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
		if_attachdomain1(ifp);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
    if_attachdomain, NULL);

/*
 * Initialize the per-address-family data of one interface by calling each
 * domain's dom_ifattach hook.  A repeated call for the same
 * domain_init_status is logged and ignored.
 */
static void
if_attachdomain1(struct ifnet *ifp)
{
	struct domain *dp;

	/*
	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
	 * cannot lock ifp->if_afdata initialization, entirely.
	 */
	IF_AFDATA_LOCK(ifp);
	if (ifp->if_afdata_initialized >= domain_init_status) {
		IF_AFDATA_UNLOCK(ifp);
		log(LOG_WARNING, "%s called more than once on %s\n",
		    __func__, ifp->if_xname);
		return;
	}
	ifp->if_afdata_initialized = domain_init_status;
	IF_AFDATA_UNLOCK(ifp);

	/* address family dependent data region */
	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
	for (dp = domains; dp; dp = dp->dom_next) {
		if (dp->dom_ifattach)
			ifp->if_afdata[dp->dom_family] =
			    (*dp->dom_ifattach)(ifp);
	}
}

/*
 * Remove any unicast or broadcast network addresses from an interface.
 * AF_LINK entries are left in place.
 */
void
if_purgeaddrs(struct ifnet *ifp)
{
	struct ifaddr *ifa, *next;

	NET_EPOCH_ENTER();
	CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
		if (ifa->ifa_addr->sa_family == AF_LINK)
			continue;
#ifdef INET
		/* XXX: Ugly!! ad hoc just for INET */
		if (ifa->ifa_addr->sa_family == AF_INET) {
			struct ifaliasreq ifr;

			bzero(&ifr, sizeof(ifr));
			ifr.ifra_addr = *ifa->ifa_addr;
			if (ifa->ifa_dstaddr)
				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
			/* On failure fall through to the generic removal. */
			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
			    NULL) == 0)
				continue;
		}
#endif /* INET */
#ifdef INET6
		if (ifa->ifa_addr->sa_family == AF_INET6) {
			in6_purgeaddr(ifa);
			/* ifp_addrhead is already updated */
			continue;
		}
#endif /* INET6 */
		IF_ADDR_WLOCK(ifp);
		CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
		IF_ADDR_WUNLOCK(ifp);
		ifa_free(ifa);
	}
	NET_EPOCH_EXIT();
}

/*
 * Remove any multicast network addresses from an interface when an ifnet
 * is going away.
 */
static void
if_purgemaddrs(struct ifnet *ifp)
{
	struct ifmultiaddr *ifma;

	IF_ADDR_WLOCK(ifp);
	while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
		ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
		if_delmulti_locked(ifp, ifma, 1);
	}
	IF_ADDR_WUNLOCK(ifp);
}

/*
 * Detach an interface, removing it from the list of "active" interfaces.
 * If vmove flag is set on entry to if_detach_internal(), perform only a
 * limited subset of cleanup tasks, given that we are moving an ifnet from
 * one vnet to another, where it must be fully operational.
 *
 * XXXRW: There are some significant questions about event ordering, and
 * how to prevent things from starting to use the interface during detach.
 */
void
if_detach(struct ifnet *ifp)
{

	CURVNET_SET_QUIET(ifp->if_vnet);
	/* The return value is deliberately ignored here. */
	if_detach_internal(ifp, 0, NULL);
	CURVNET_RESTORE();
}

/*
 * The vmove flag, if set, indicates that we are called from a callpath
 * that is moving an interface to a different vnet instance.
 *
 * The shutdown flag, if set, indicates that we are called in the
 * process of shutting down a vnet instance.  Currently only the
 * vnet_if_return SYSUNINIT function sets it.
Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* * In case of a vmove we are done here without error. 
* If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ free(ifp->if_hw_addr, M_IFADDR); ifp->if_hw_addr = NULL; ifp->if_addr = NULL; /* We can now free link ifaddr. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. 
*/ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; void *old; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. 
*/ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. 
*/ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { 
free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; int freeifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } freeifgl = 0; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; int ifglfree; IFNET_WLOCK(); while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { ifgl = CK_STAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, 
ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); ifglfree = 0; if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); ifglfree = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(ifgm, M_TEMP); if (ifglfree) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. 
*/
/*
 * if_getgroup: two-mode query.
 *  - ifgr->ifgr_len == 0: only compute the buffer size needed (one
 *    struct ifg_req per group) and store it in ifgr->ifgr_len.
 *  - otherwise: copy one struct ifg_req per group out to the buffer
 *    returned by ifgr_groups_get().
 * Returns 0 on success, EINVAL if the supplied length is too small for the
 * next entry, or the copyout() error.
 */
static int
if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
{
	int			 len, error;
	struct ifg_list		*ifgl;
	struct ifg_req		 ifgrq, *ifgp;

	if (ifgr->ifgr_len == 0) {
		IF_ADDR_RLOCK(ifp);
		CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
			ifgr->ifgr_len += sizeof(struct ifg_req);
		IF_ADDR_RUNLOCK(ifp);
		return (0);
	}

	len = ifgr->ifgr_len;
	ifgp = ifgr_groups_get(ifgr);
	/* XXX: wire */
	IF_ADDR_RLOCK(ifp);
	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
		if (len < sizeof(ifgrq)) {
			IF_ADDR_RUNLOCK(ifp);
			return (EINVAL);
		}
		/* Zero the whole record before filling in the name. */
		bzero(&ifgrq, sizeof ifgrq);
		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
		    sizeof(ifgrq.ifgrq_group));
		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
			IF_ADDR_RUNLOCK(ifp);
			return (error);
		}
		len -= sizeof(ifgrq);
		ifgp++;
	}
	IF_ADDR_RUNLOCK(ifp);

	return (0);
}

/*
 * Stores all members of a group in memory pointed to by ifgr
 *
 * The group is looked up by ifgr->ifgr_name; ENOENT is returned when it
 * does not exist.  As in if_getgroup(), a zero ifgr_len only computes the
 * required buffer size.  Otherwise one struct ifg_req per member interface
 * is copied out; EINVAL means the buffer is too small for the next entry.
 */
static int
if_getgroupmembers(struct ifgroupreq *ifgr)
{
	struct ifg_group	*ifg;
	struct ifg_member	*ifgm;
	struct ifg_req		 ifgrq, *ifgp;
	int			 len, error;

	IFNET_RLOCK();
	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
			break;
	if (ifg == NULL) {
		IFNET_RUNLOCK();
		return (ENOENT);
	}

	if (ifgr->ifgr_len == 0) {
		CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
			ifgr->ifgr_len += sizeof(ifgrq);
		IFNET_RUNLOCK();
		return (0);
	}

	len = ifgr->ifgr_len;
	ifgp = ifgr_groups_get(ifgr);
	CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
		if (len < sizeof(ifgrq)) {
			IFNET_RUNLOCK();
			return (EINVAL);
		}
		bzero(&ifgrq, sizeof ifgrq);
		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
		    sizeof(ifgrq.ifgrq_member));
		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
			IFNET_RUNLOCK();
			return (error);
		}
		len -= sizeof(ifgrq);
		ifgp++;
	}
	IFNET_RUNLOCK();

	return (0);
}

/*
 * Return counter values from counter(9)s stored in ifnet.
*/
/* cnt must be below IFCOUNTERS (asserted); it indexes ifp->if_counters. */
uint64_t
if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
{

	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
	return (counter_u64_fetch(ifp->if_counters[cnt]));
}

/*
 * Increase an ifnet counter. Usually used for counters shared
 * between the stack and a driver, but function supports them all.
 *
 * inc is added to ifp->if_counters[cnt]; cnt must be below IFCOUNTERS
 * (asserted).
 */
void
if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
{

	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
	counter_u64_add(ifp->if_counters[cnt], inc);
}

/*
 * Copy data from ifnet to userland API structure if_data.
 *
 * Only the fields assigned below are written; ifi_physical and ifi_vhid
 * are set to 0.  Statistics are read through the interface's
 * if_get_counter method rather than from if_counters directly.
 */
void
if_data_copy(struct ifnet *ifp, struct if_data *ifd)
{

	ifd->ifi_type = ifp->if_type;
	ifd->ifi_physical = 0;
	ifd->ifi_addrlen = ifp->if_addrlen;
	ifd->ifi_hdrlen = ifp->if_hdrlen;
	ifd->ifi_link_state = ifp->if_link_state;
	ifd->ifi_vhid = 0;
	ifd->ifi_datalen = sizeof(struct if_data);
	ifd->ifi_mtu = ifp->if_mtu;
	ifd->ifi_metric = ifp->if_metric;
	ifd->ifi_baudrate = ifp->if_baudrate;
	ifd->ifi_hwassist = ifp->if_hwassist;
	ifd->ifi_epoch = ifp->if_epoch;
	ifd->ifi_lastchange = ifp->if_lastchange;

	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
}

/*
 * Wrapper functions for struct ifnet address list locking macros.
These are
 * used by kernel modules to avoid encoding programming interface or binary
 * interface assumptions that may be violated when kernel-internal locking
 * approaches change.
 */
/*
 * NOTE(review): the lines below that start with '-' or '+' are unified-diff
 * markers carried in this file (old IF_ADDR_RLOCK/RUNLOCK calls removed,
 * epoch_enter_preempt/epoch_exit_preempt calls added).  They are kept
 * exactly as found.
 *
 * In the added lines, the rlock wrappers assert that the per-ifnet tracker
 * (if_addr_et / if_maddr_et) is zero before entering net_epoch_preempt,
 * and the runlock wrappers clear it again under INVARIANTS.
 */
void
if_addr_rlock(struct ifnet *ifp)
{
-
-	IF_ADDR_RLOCK(ifp);
+	MPASS(*(uint64_t *)&ifp->if_addr_et == 0);
+	epoch_enter_preempt(net_epoch_preempt, &ifp->if_addr_et);
}

void
if_addr_runlock(struct ifnet *ifp)
{
-
-	IF_ADDR_RUNLOCK(ifp);
+	epoch_exit_preempt(net_epoch_preempt, &ifp->if_addr_et);
+#ifdef INVARIANTS
+	bzero(&ifp->if_addr_et, sizeof(struct epoch_tracker));
+#endif
}

void
if_maddr_rlock(if_t ifp)
{

-	IF_ADDR_RLOCK((struct ifnet *)ifp);
+	MPASS(*(uint64_t *)&ifp->if_maddr_et == 0);
+	epoch_enter_preempt(net_epoch_preempt, &ifp->if_maddr_et);
}

void
if_maddr_runlock(if_t ifp)
{

-	IF_ADDR_RUNLOCK((struct ifnet *)ifp);
+	epoch_exit_preempt(net_epoch_preempt, &ifp->if_maddr_et);
+#ifdef INVARIANTS
+	bzero(&ifp->if_maddr_et, sizeof(struct epoch_tracker));
+#endif
}

/*
 * Initialization, destruction and refcounting functions for ifaddrs.
*/
/*
 * Allocate a zeroed ifaddr of 'size' bytes (at least sizeof(struct ifaddr),
 * asserted) together with its four packet/byte counters.  'flags' is passed
 * through to malloc() and counter_u64_alloc().  The new ifaddr starts with
 * a reference count of 1.  Returns NULL if any allocation fails.
 */
struct ifaddr *
ifa_alloc(size_t size, int flags)
{
	struct ifaddr *ifa;

	KASSERT(size >= sizeof(struct ifaddr),
	    ("%s: invalid size %zu", __func__, size));

	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
	if (ifa == NULL)
		return (NULL);

	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
		goto fail;
	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
		goto fail;
	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
		goto fail;
	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
		goto fail;

	refcount_init(&ifa->ifa_refcnt, 1);

	return (ifa);

fail:
	/* free(NULL) is okay */
	counter_u64_free(ifa->ifa_opackets);
	counter_u64_free(ifa->ifa_ipackets);
	counter_u64_free(ifa->ifa_obytes);
	counter_u64_free(ifa->ifa_ibytes);
	free(ifa, M_IFADDR);

	return (NULL);
}

/* Take an additional reference on ifa. */
void
ifa_ref(struct ifaddr *ifa)
{

	refcount_acquire(&ifa->ifa_refcnt);
}

/*
 * Epoch callback scheduled by ifa_free(): recover the ifaddr from its
 * embedded epoch context and release the counters and the ifaddr itself.
 */
static void
ifa_destroy(epoch_context_t ctx)
{
	struct ifaddr *ifa;

	ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
	counter_u64_free(ifa->ifa_opackets);
	counter_u64_free(ifa->ifa_ipackets);
	counter_u64_free(ifa->ifa_obytes);
	counter_u64_free(ifa->ifa_ibytes);
	free(ifa, M_IFADDR);
}

/*
 * Drop one reference; when the last one goes, destruction is deferred
 * through epoch_call() on net_epoch_preempt rather than done inline.
 */
void
ifa_free(struct ifaddr *ifa)
{

	if (refcount_release(&ifa->ifa_refcnt))
		epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy);
}

/*
 * Issue a routing request 'cmd' (RTM_ADD, RTM_DELETE or RTM_CHANGE) for a
 * pinned static host route to 'ia' in the interface's FIB.  'otype' is only
 * used in the failure message.  EEXIST on add and ENOENT on delete are
 * returned but not logged.
 */
static int
ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa,
    struct sockaddr *ia)
{
	int error;
	struct rt_addrinfo info;
	struct sockaddr_dl null_sdl;
	struct ifnet *ifp;

	ifp = ifa->ifa_ifp;

	bzero(&info, sizeof(info));
	if (cmd != RTM_DELETE)
		info.rti_ifp = V_loif;
	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED;
	info.rti_info[RTAX_DST] = ia;
	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
	link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type);

	error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib);

	if (error != 0 &&
	    !(cmd == RTM_ADD && error == EEXIST) &&
	    !(cmd == RTM_DELETE && error == ENOENT))
		if_printf(ifp, "%s failed: %d\n", otype, error);

	return (error);
}

/* RTM_ADD wrapper around ifa_maintain_loopback_route(). */
int
ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{

	return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia));
}

/* RTM_DELETE wrapper around ifa_maintain_loopback_route(). */
int
ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{

	return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia));
}

/* RTM_CHANGE wrapper around ifa_maintain_loopback_route(). */
int
ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{

	return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia));
}

/*
 * XXX: Because sockaddr_dl has deeper structure than the sockaddr
 * structs used to represent other address families, it is necessary
 * to perform a different comparison.
 */

#define	sa_dl_equal(a1, a2)	\
	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))

/*
 * Locate an interface based on a complete address.
 *
 * Walks every address of every interface on V_ifnet and returns the first
 * one whose address, or (on IFF_BROADCAST interfaces) broadcast address,
 * equals 'addr'; NULL if none matches.  No reference is taken on the
 * returned ifaddr.
 *
 * NOTE(review): the '-'/'+' lines below are unified-diff markers carried in
 * this file and are kept exactly as found.
 */
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(const struct sockaddr *addr)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;

-	MPASS(in_epoch());
+	MPASS(in_epoch(net_epoch_preempt));
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != addr->sa_family)
				continue;
			if (sa_equal(addr, ifa->ifa_addr)) {
				goto done;
			}
			/* IP6 doesn't have broadcast */
			if ((ifp->if_flags & IFF_BROADCAST) &&
			    ifa->ifa_broadaddr &&
			    ifa->ifa_broadaddr->sa_len != 0 &&
			    sa_equal(ifa->ifa_broadaddr, addr)) {
				goto done;
			}
		}
	}
	ifa = NULL;
done:
	return (ifa);
}

/*
 * Boolean form of ifa_ifwithaddr(): enters the network epoch itself and
 * returns non-zero when some interface owns 'addr'.
 */
int
ifa_ifwithaddr_check(const struct sockaddr *addr)
{
	int rc;

	NET_EPOCH_ENTER();
	rc = (ifa_ifwithaddr(addr) != NULL);
	NET_EPOCH_EXIT();
	return (rc);
}

/*
 * Locate an interface based on the broadcast address.
*/
/*
 * Only interfaces in 'fibnum' are considered unless fibnum is RT_ALL_FIBS.
 * Returns the first matching ifaddr, or NULL; no reference is taken.
 *
 * NOTE(review): throughout this span the '-'/'+' lines are unified-diff
 * markers carried in this file and are kept exactly as found.
 */
/* ARGSUSED */
struct ifaddr *
ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;

-	MPASS(in_epoch());
+	MPASS(in_epoch(net_epoch_preempt));
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
			continue;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != addr->sa_family)
				continue;
			if ((ifp->if_flags & IFF_BROADCAST) &&
			    ifa->ifa_broadaddr &&
			    ifa->ifa_broadaddr->sa_len != 0 &&
			    sa_equal(ifa->ifa_broadaddr, addr)) {
				goto done;
			}
		}
	}
	ifa = NULL;
done:
	return (ifa);
}

/*
 * Locate the point to point interface with a given destination address.
 *
 * Only IFF_POINTOPOINT interfaces in 'fibnum' (or any FIB when fibnum is
 * RT_ALL_FIBS) are examined.  Returns NULL when nothing matches.
 */
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;

-	MPASS(in_epoch());
+	MPASS(in_epoch(net_epoch_preempt));
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
			continue;
		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
			continue;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != addr->sa_family)
				continue;
			if (ifa->ifa_dstaddr != NULL &&
			    sa_equal(addr, ifa->ifa_dstaddr)) {
				goto done;
			}
		}
	}
	ifa = NULL;
done:
	return (ifa);
}

/*
 * Find an interface on a specific network.  If many, choice
 * is most specific found.
 *
 * ignore_ptp, when non-zero, disables the destination-address shortcut for
 * AF_INET point-to-point interfaces so that they are matched by netmask
 * like everything else.
 */
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct ifaddr *ifa_maybe = NULL;
	u_int af = addr->sa_family;
	const char *addr_data = addr->sa_data, *cplim;

-	MPASS(in_epoch());
+	MPASS(in_epoch(net_epoch_preempt));
	/*
	 * AF_LINK addresses can be looked up directly by their index number,
	 * so do that if we can.
	 */
	if (af == AF_LINK) {
	    const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr;
	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
		return (ifaddr_byindex(sdl->sdl_index));
	}

	/*
	 * Scan through each interface, looking for ones that have addresses
	 * in this address family and the requested fib.  Maintain a reference
	 * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that
	 * kept it stable when we move onto the next interface.
	 *
	 * NOTE(review): no ifa_ref() or IF_ADDR_RLOCK() call is visible in
	 * this function any more; the paragraph above looks stale - confirm.
	 */
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
			continue;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			const char *cp, *cp2, *cp3;

			/* 'next' is also the target of the mask-mismatch goto below. */
			if (ifa->ifa_addr->sa_family != af)
next:				continue;
			if (af == AF_INET &&
			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
				/*
				 * This is a bit broken as it doesn't
				 * take into account that the remote end may
				 * be a single node in the network we are
				 * looking for.
				 * The trouble is that we don't know the
				 * netmask for the remote end.
				 */
				if (ifa->ifa_dstaddr != NULL &&
				    sa_equal(addr, ifa->ifa_dstaddr)) {
-					IF_ADDR_RUNLOCK(ifp);
					goto done;
				}
			} else {
				/*
				 * Scan all the bits in the ifa's address.
				 * If a bit disagrees with what we are
				 * looking for, mask it with the netmask
				 * to see if it really matters.
				 * (A byte at a time)
				 */
				if (ifa->ifa_netmask == 0)
					continue;
				cp = addr_data;
				cp2 = ifa->ifa_addr->sa_data;
				cp3 = ifa->ifa_netmask->sa_data;
				cplim = ifa->ifa_netmask->sa_len
					+ (char *)ifa->ifa_netmask;
				while (cp3 < cplim)
					if ((*cp++ ^ *cp2++) & *cp3++)
						goto next; /* next address! */
				/*
				 * If the netmask of what we just found
				 * is more specific than what we had before
				 * (if we had one), or if the virtual status
				 * of new prefix is better than of the old one,
				 * then remember the new one before continuing
				 * to search for an even better one.
				 */
				if (ifa_maybe == NULL ||
				    ifa_preferred(ifa_maybe, ifa) ||
				    rn_refines((caddr_t)ifa->ifa_netmask,
				    (caddr_t)ifa_maybe->ifa_netmask)) {
					ifa_maybe = ifa;
				}
			}
		}
	}
	ifa = ifa_maybe;
	ifa_maybe = NULL;
done:
	return (ifa);
}

/*
 * Find an interface address specific to an interface best matching
 * a given address.
 *
 * Returns NULL if the address family is out of range.  Otherwise returns
 * an exact/netmask match if there is one, else the first address of the
 * same family on the interface (or NULL if there is none).
 */
struct ifaddr *
ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
{
	struct ifaddr *ifa;
	const char *cp, *cp2, *cp3;
	char *cplim;
	struct ifaddr *ifa_maybe = NULL;
	u_int af = addr->sa_family;

	if (af >= AF_MAX)
		return (NULL);
-	MPASS(in_epoch());
+
+	MPASS(in_epoch(net_epoch_preempt));
	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
		if (ifa->ifa_addr->sa_family != af)
			continue;
		/* Remember the first same-family address as the fallback. */
		if (ifa_maybe == NULL)
			ifa_maybe = ifa;
		if (ifa->ifa_netmask == 0) {
			if (sa_equal(addr, ifa->ifa_addr) ||
			    (ifa->ifa_dstaddr &&
			    sa_equal(addr, ifa->ifa_dstaddr)))
				goto done;
			continue;
		}
		if (ifp->if_flags & IFF_POINTOPOINT) {
			/*
			 * NOTE(review): unlike the netmask-less branch above,
			 * ifa_dstaddr is not NULL-checked before sa_equal()
			 * here - confirm it cannot be NULL on a
			 * point-to-point interface.
			 */
			if (sa_equal(addr, ifa->ifa_dstaddr))
				goto done;
		} else {
			cp = addr->sa_data;
			cp2 = ifa->ifa_addr->sa_data;
			cp3 = ifa->ifa_netmask->sa_data;
			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
			for (; cp3 < cplim; cp3++)
				if ((*cp++ ^ *cp2++) & *cp3)
					break;
			/* Reached the end of the mask: every masked byte matched. */
			if (cp3 == cplim)
				goto done;
		}
	}
	ifa = ifa_maybe;
done:
	return (ifa);
}

/*
 * See whether new ifa is better than current one:
 * 1) A non-virtual one is preferred over virtual.
 * 2) A virtual in master state preferred over any other state.
 *
 * Used in several address selecting functions.
 *
 * Returns non-zero when 'next' should replace 'cur'.
 */
int
ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
{

	return (cur->ifa_carp && (!next->ifa_carp ||
	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
}

#include
/*
 * Default action when installing a route with a Link Level gateway.
 * Lookup an appropriate real ifa to point to.
 * This should be moved to /sys/net/link.c eventually.
*/
/*
 * Only RTM_ADD is handled; the call is a no-op when the route has no ifa,
 * the ifa has no interface, or the route has no key.  When a better ifa is
 * found the route's reference is moved from the old ifa to the new one,
 * and the new ifa's own ifa_rtrequest hook (if it is not this function) is
 * invoked.
 */
static void
link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr *dst;
	struct ifnet *ifp;

	if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) ||
	    ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL))
		return;
	NET_EPOCH_ENTER();
	ifa = ifaof_ifpforaddr(dst, ifp);
	if (ifa) {
		oifa = rt->rt_ifa;
		if (oifa != ifa) {
			ifa_free(oifa);
			ifa_ref(ifa);
		}
		rt->rt_ifa = ifa;
		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
			ifa->ifa_rtrequest(cmd, rt, info);
	}
	NET_EPOCH_EXIT();
}

/*
 * Allocate 'size' bytes for a sockaddr_dl from M_TEMP; 'flags' is passed to
 * malloc() unchanged.  Release with link_free_sdl().
 */
struct sockaddr_dl *
link_alloc_sdl(size_t size, int flags)
{

	return (malloc(size, M_TEMP, flags));
}

/* Release memory obtained from link_alloc_sdl(). */
void
link_free_sdl(struct sockaddr *sa)
{
	free(sa, M_TEMP);
}

/*
 * Fills in given sdl with interface basic info.
 * Returns pointer to filled sdl.
 *
 * paddr must point to at least sizeof(struct sockaddr_dl) bytes; that many
 * bytes are zeroed before length, family, index and type are set.
 */
struct sockaddr_dl *
link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
{
	struct sockaddr_dl *sdl;

	sdl = (struct sockaddr_dl *)paddr;
	memset(sdl, 0, sizeof(struct sockaddr_dl));
	sdl->sdl_len = sizeof(struct sockaddr_dl);
	sdl->sdl_family = AF_LINK;
	sdl->sdl_index = ifp->if_index;
	sdl->sdl_type = iftype;

	return (sdl);
}

/*
 * Mark an interface down and notify protocols of
 * the transition.
 *
 * flag must be IFF_UP (asserted).  fam selects which address families are
 * notified through pfctlinput(); PF_UNSPEC means all.  The send queue is
 * flushed and a routing-socket interface message is generated.
 */
static void
if_unroute(struct ifnet *ifp, int flag, int fam)
{
	struct ifaddr *ifa;

	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));

	ifp->if_flags &= ~flag;
	getmicrotime(&ifp->if_lastchange);
	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
	ifp->if_qflush(ifp);

	if (ifp->if_carp)
		(*carp_linkstate_p)(ifp);
	rt_ifmsg(ifp);
}

/*
 * Mark an interface up and notify protocols of
 * the transition.
*/ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? 
"LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. 
*/ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. 
*/
/*
 * ifhwioctl() handles the requests listed in the switch below for an
 * already looked-up interface.  'data' is interpreted as a struct ifreq
 * (or a struct ifgroupreq for the group requests).
 *
 * Returns 0 on success, an errno value on failure, or ENOIOCTL when 'cmd'
 * is not one of the requests handled here so that the caller can try
 * other handlers.  Requests that modify state first go through
 * priv_check() with a request-specific privilege and return its error
 * unchanged.
 */
static int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
	struct ifreq *ifr;
	int error = 0, do_ifup = 0;
	int new_flags, temp_flags;
	size_t namelen, onamelen;
	size_t descrlen;
	char *descrbuf, *odescrbuf;
	char new_name[IFNAMSIZ];
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;

	ifr = (struct ifreq *)data;
	switch (cmd) {
	case SIOCGIFINDEX:
		ifr->ifr_index = ifp->if_index;
		break;

	case SIOCGIFFLAGS:
		/* Stack and driver flags are merged, then split in two 16-bit halves. */
		temp_flags = ifp->if_flags | ifp->if_drv_flags;
		ifr->ifr_flags = temp_flags & 0xffff;
		ifr->ifr_flagshigh = temp_flags >> 16;
		break;

	case SIOCGIFCAP:
		ifr->ifr_reqcap = ifp->if_capabilities;
		ifr->ifr_curcap = ifp->if_capenable;
		break;

#ifdef MAC
	case SIOCGIFMAC:
		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
		break;
#endif

	case SIOCGIFMETRIC:
		ifr->ifr_metric = ifp->if_metric;
		break;

	case SIOCGIFMTU:
		ifr->ifr_mtu = ifp->if_mtu;
		break;

	case SIOCGIFPHYS:
		/* XXXGL: did this ever worked? */
		ifr->ifr_phys = 0;
		break;

	case SIOCGIFDESCR:
		error = 0;
		sx_slock(&ifdescr_sx);
		if (ifp->if_description == NULL)
			error = ENOMSG;
		else {
			/* space for terminating nul */
			descrlen = strlen(ifp->if_description) + 1;
			/*
			 * If the caller's buffer is too small, null out its
			 * buffer pointer instead of copying; either way the
			 * required length is reported back.
			 */
			if (ifr_buffer_get_length(ifr) < descrlen)
				ifr_buffer_set_buffer_null(ifr);
			else
				error = copyout(ifp->if_description,
				    ifr_buffer_get_buffer(ifr), descrlen);
			ifr_buffer_set_length(ifr, descrlen);
		}
		sx_sunlock(&ifdescr_sx);
		break;

	case SIOCSIFDESCR:
		error = priv_check(td, PRIV_NET_SETIFDESCR);
		if (error)
			return (error);

		/*
		 * Copy only (length-1) bytes to make sure that
		 * if_description is always nul terminated.  The
		 * length parameter is supposed to count the
		 * terminating nul in.
		 */
		if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
			return (ENAMETOOLONG);
		else if (ifr_buffer_get_length(ifr) == 0)
			descrbuf = NULL;
		else {
			descrbuf = malloc(ifr_buffer_get_length(ifr),
			    M_IFDESCR, M_WAITOK | M_ZERO);
			error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
			    ifr_buffer_get_length(ifr) - 1);
			if (error) {
				free(descrbuf, M_IFDESCR);
				break;
			}
		}

		/* Swap in the new description; the old one is freed after unlock. */
		sx_xlock(&ifdescr_sx);
		odescrbuf = ifp->if_description;
		ifp->if_description = descrbuf;
		sx_xunlock(&ifdescr_sx);

		getmicrotime(&ifp->if_lastchange);
		free(odescrbuf, M_IFDESCR);
		break;

	case SIOCGIFFIB:
		ifr->ifr_fib = ifp->if_fib;
		break;

	case SIOCSIFFIB:
		error = priv_check(td, PRIV_NET_SETIFFIB);
		if (error)
			return (error);
		if (ifr->ifr_fib >= rt_numfibs)
			return (EINVAL);

		ifp->if_fib = ifr->ifr_fib;
		break;

	case SIOCSIFFLAGS:
		error = priv_check(td, PRIV_NET_SETIFFLAGS);
		if (error)
			return (error);
		/*
		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
		 * check, so we don't need special handling here yet.
		 */
		new_flags = (ifr->ifr_flags & 0xffff) |
		    (ifr->ifr_flagshigh << 16);
		/*
		 * A down transition is performed right away; an up
		 * transition is deferred (do_ifup) until after the flags
		 * have been stored and the driver has been called.
		 */
		if (ifp->if_flags & IFF_UP &&
		    (new_flags & IFF_UP) == 0) {
			if_down(ifp);
		} else if (new_flags & IFF_UP &&
		    (ifp->if_flags & IFF_UP) == 0) {
			do_ifup = 1;
		}
		/* See if permanently promiscuous mode bit is about to flip */
		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
			if (new_flags & IFF_PPROMISC)
				ifp->if_flags |= IFF_PROMISC;
			else if (ifp->if_pcount == 0)
				ifp->if_flags &= ~IFF_PROMISC;
			if (log_promisc_mode_change)
				if_printf(ifp, "permanently promiscuous mode %s\n",
				    ((new_flags & IFF_PPROMISC) ?
				     "enabled" : "disabled"));
		}
		/* Bits in IFF_CANTCHANGE keep their current value. */
		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
			(new_flags &~ IFF_CANTCHANGE);
		if (ifp->if_ioctl) {
			/* The driver's return value is deliberately ignored here. */
			(void) (*ifp->if_ioctl)(ifp, cmd, data);
		}
		if (do_ifup)
			if_up(ifp);
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFCAP:
		error = priv_check(td, PRIV_NET_SETIFCAP);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		/* Reject requests for capabilities the interface does not have. */
		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
			return (EINVAL);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

#ifdef MAC
	case SIOCSIFMAC:
		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
		break;
#endif

	case SIOCSIFNAME:
		error = priv_check(td, PRIV_NET_SETIFNAME);
		if (error)
			return (error);
		error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
		    NULL);
		if (error != 0)
			return (error);
		if (new_name[0] == '\0')
			return (EINVAL);
		if (new_name[IFNAMSIZ-1] != '\0') {
			new_name[IFNAMSIZ-1] = '\0';
			if (strlen(new_name) == IFNAMSIZ-1)
				return (EINVAL);
		}
		if (ifunit(new_name) != NULL)
			return (EEXIST);

		/*
		 * XXX: Locking.  Nothing else seems to lock if_flags,
		 * and there are numerous other races with the
		 * ifunit() checks not being atomic with namespace
		 * changes (renames, vmoves, if_attach, etc).
		 */
		ifp->if_flags |= IFF_RENAMING;

		/* Announce the departure of the interface. */
		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);

		if_printf(ifp, "changing name to '%s'\n", new_name);

		IF_ADDR_WLOCK(ifp);
		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
		ifa = ifp->if_addr;
		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
		namelen = strlen(new_name);
		onamelen = sdl->sdl_nlen;
		/*
		 * Move the address if needed.  This is safe because we
		 * allocate space for a name of length IFNAMSIZ when we
		 * create this in if_attach().
		 */
		if (namelen != onamelen) {
			bcopy(sdl->sdl_data + onamelen,
			    sdl->sdl_data + namelen, sdl->sdl_alen);
		}
		bcopy(new_name, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		/* Rebuild the netmask: 0xff over the new name, zero over the old. */
		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
		bzero(sdl->sdl_data, onamelen);
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		IF_ADDR_WUNLOCK(ifp);

		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
		/* Announce the return of the interface. */
		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);

		ifp->if_flags &= ~IFF_RENAMING;
		break;

#ifdef VIMAGE
	case SIOCSIFVNET:
		error = priv_check(td, PRIV_NET_SETIFVNET);
		if (error)
			return (error);
		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
		break;
#endif

	case SIOCSIFMETRIC:
		error = priv_check(td, PRIV_NET_SETIFMETRIC);
		if (error)
			return (error);
		ifp->if_metric = ifr->ifr_metric;
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYS:
		error = priv_check(td, PRIV_NET_SETIFPHYS);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFMTU:
	{
		/* Remembered so a change made by the driver can be detected. */
		u_long oldmtu = ifp->if_mtu;

		error = priv_check(td, PRIV_NET_SETIFMTU);
		if (error)
			return (error);
		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
			return (EINVAL);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0) {
			getmicrotime(&ifp->if_lastchange);
			rt_ifmsg(ifp);
#ifdef INET
			NETDUMP_REINIT(ifp);
#endif
		}
		/*
		 * If the link MTU changed, do network layer specific procedure.
		 */
		if (ifp->if_mtu != oldmtu) {
#ifdef INET6
			nd6_setmtu(ifp);
#endif
			rt_updatemtu(ifp);
		}
		break;
	}

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		if (cmd == SIOCADDMULTI)
			error = priv_check(td, PRIV_NET_ADDMULTI);
		else
			error = priv_check(td, PRIV_NET_DELMULTI);
		if (error)
			return (error);

		/* Don't allow group membership on non-multicast interfaces. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return (EOPNOTSUPP);

		/* Don't let users screw up protocols' entries. */
		if (ifr->ifr_addr.sa_family != AF_LINK)
			return (EINVAL);

		if (cmd == SIOCADDMULTI) {
			struct ifmultiaddr *ifma;

			/*
			 * Userland is only permitted to join groups once
			 * via the if_addmulti() KPI, because it cannot hold
			 * struct ifmultiaddr * between calls. It may also
			 * lose a race while we check if the membership
			 * already exists.
			 */
			IF_ADDR_RLOCK(ifp);
			ifma = if_findmulti(ifp, &ifr->ifr_addr);
			IF_ADDR_RUNLOCK(ifp);
			if (ifma != NULL)
				error = EADDRINUSE;
			else
				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
		} else {
			error = if_delmulti(ifp, &ifr->ifr_addr);
		}
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	/* Privileged requests that are simply forwarded to the driver. */
	case SIOCSIFPHYADDR:
	case SIOCDIFPHYADDR:
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif
	case SIOCSIFMEDIA:
	case SIOCSIFGENERIC:
		error = priv_check(td, PRIV_NET_HWIOCTL);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	/* Requests forwarded to the driver without a privilege check. */
	case SIOCGIFSTATUS:
	case SIOCGIFPSRCADDR:
	case SIOCGIFPDSTADDR:
	case SIOCGIFMEDIA:
	case SIOCGIFXMEDIA:
	case SIOCGIFGENERIC:
	case SIOCGIFRSSKEY:
	case SIOCGIFRSSHASH:
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		break;

	case SIOCSIFLLADDR:
		error = priv_check(td, PRIV_NET_SETLLADDR);
		if (error)
			return (error);
		error = if_setlladdr(ifp,
		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
		break;

	case SIOCGHWADDR:
		error = if_gethwaddr(ifp, ifr);
		break;

	CASE_IOC_IFGROUPREQ(SIOCAIFGROUP):
		error = priv_check(td, PRIV_NET_ADDIFGROUP);
		if (error)
			return (error);
		if ((error = if_addgroup(ifp,
		    ifgr_group_get((struct ifgroupreq *)data))))
			return (error);
		break;

	CASE_IOC_IFGROUPREQ(SIOCGIFGROUP):
		if ((error = if_getgroup((struct ifgroupreq *)data, ifp)))
			return (error);
		break;

	CASE_IOC_IFGROUPREQ(SIOCDIFGROUP):
		error = priv_check(td, PRIV_NET_DELIFGROUP);
		if (error)
			return (error);
		if ((error = if_delgroup(ifp,
		    ifgr_group_get((struct ifgroupreq *)data))))
			return (error);
		break;

	default:
		/* Not ours; lets ifioctl() try the protocol and the driver. */
		error = ENOIOCTL;
		break;
	}
	return (error);
}

#ifdef COMPAT_FREEBSD32
/* Layout of struct ifconf with 32-bit fields, used by SIOCGIFCONF32. */
struct ifconf32 {
	int32_t	ifc_len;
	union {
		uint32_t	ifcu_buf;
		uint32_t	ifcu_req;
	} ifc_ifcu;
};
#define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
#endif

#ifdef COMPAT_FREEBSD32
/*
 * Fill a native struct ifmediareq from the struct ifmediareq32 that
 * 'data' points to; the 32-bit ifm_ulist value is widened to a pointer.
 */
static void
ifmr_init(struct ifmediareq *ifmr, caddr_t data)
{
	struct ifmediareq32 *ifmr32;

	ifmr32 = (struct ifmediareq32 *)data;
	memcpy(ifmr->ifm_name, ifmr32->ifm_name,
	    sizeof(ifmr->ifm_name));
	ifmr->ifm_current = ifmr32->ifm_current;
	ifmr->ifm_mask = ifmr32->ifm_mask;
	ifmr->ifm_status = ifmr32->ifm_status;
	ifmr->ifm_active = ifmr32->ifm_active;
	ifmr->ifm_count = ifmr32->ifm_count;
	ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist;
}

/*
 * Copy the result fields of a native struct ifmediareq back into the
 * struct ifmediareq32 that 'data' points to.  ifm_name and ifm_ulist
 * are not written back.
 */
static void
ifmr_update(const struct ifmediareq *ifmr, caddr_t data)
{
	struct ifmediareq32 *ifmr32;

	ifmr32 = (struct ifmediareq32 *)data;
	ifmr32->ifm_current = ifmr->ifm_current;
	ifmr32->ifm_mask = ifmr->ifm_mask;
	ifmr32->ifm_status = ifmr->ifm_status;
	ifmr32->ifm_active = ifmr->ifm_active;
	ifmr32->ifm_count = ifmr->ifm_count;
}
#endif

/*
 * Interface ioctls.
 *
 * Dispatch order, as implemented below:
 *  1. SIOCGIFCONF (and its 32-bit variant) go to ifconf() and return.
 *  2. Under COMPAT_FREEBSD32, 32-bit media requests are converted to the
 *     native struct ifmediareq and converted back at out_noref.
 *  3. Requests that do not need an existing ifnet (clone create/destroy,
 *     cloner list, group members, vnet reclaim, CARP) are handled next.
 *  4. Otherwise the interface named in the ifreq is looked up with a
 *     reference (ENXIO if absent) and passed to ifhwioctl(); if that
 *     returns ENOIOCTL the request goes to the socket's pru_control and
 *     possibly on to the driver.
 */
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
#ifdef COMPAT_FREEBSD32
	caddr_t saved_data = NULL;
	struct ifmediareq ifmr;
#endif
	struct ifmediareq *ifmrp;
	struct ifnet *ifp;
	struct ifreq *ifr;
	int error;
	int oif_flags;
#ifdef VIMAGE
	int shutdown;
#endif

	CURVNET_SET(so->so_vnet);
#ifdef VIMAGE
	/* Make sure the VNET is stable. */
	shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET &&
	    so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
	if (shutdown) {
		CURVNET_RESTORE();
		return (EBUSY);
	}
#endif

	switch (cmd) {
	case SIOCGIFCONF:
		error = ifconf(cmd, data);
		CURVNET_RESTORE();
		return (error);

#ifdef COMPAT_FREEBSD32
	case SIOCGIFCONF32:
		{
			struct ifconf32 *ifc32;
			struct ifconf ifc;

			/* Run the native handler on a widened copy. */
			ifc32 = (struct ifconf32 *)data;
			ifc.ifc_len = ifc32->ifc_len;
			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);

			error = ifconf(SIOCGIFCONF, (void *)&ifc);
			CURVNET_RESTORE();
			if (error == 0)
				ifc32->ifc_len = ifc.ifc_len;
			return (error);
		}
#endif
	}

	ifmrp = NULL;
#ifdef COMPAT_FREEBSD32
	switch (cmd) {
	case SIOCGIFMEDIA32:
	case SIOCGIFXMEDIA32:
		/*
		 * Redirect 'data' to a native ifmediareq on the stack and
		 * rewrite 'cmd' to its native equivalent; 'saved_data' keeps
		 * the caller's buffer for the copy back at out_noref.
		 */
		ifmrp = &ifmr;
		ifmr_init(ifmrp, data);
		cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
		saved_data = data;
		data = (caddr_t)ifmrp;
	}
#endif

	ifr = (struct ifreq *)data;
	switch (cmd) {
#ifdef VIMAGE
	case SIOCSIFRVNET:
		error = priv_check(td, PRIV_NET_SETIFVNET);
		if (error == 0)
			error = if_vmove_reclaim(td, ifr->ifr_name,
			    ifr->ifr_jid);
		goto out_noref;
#endif
	case SIOCIFCREATE:
	case SIOCIFCREATE2:
		error = priv_check(td, PRIV_NET_IFCREATE);
		if (error == 0)
			error = if_clone_create(ifr->ifr_name,
			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
			    ifr_data_get_ptr(ifr) : NULL);
		goto out_noref;
	case SIOCIFDESTROY:
		error = priv_check(td, PRIV_NET_IFDESTROY);
		if (error == 0)
			error = if_clone_destroy(ifr->ifr_name);
		goto out_noref;

	case SIOCIFGCLONERS:
		error = if_clone_list((struct if_clonereq *)data);
		goto out_noref;

	CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB):
		error = if_getgroupmembers((struct ifgroupreq *)data);
		goto out_noref;

#if defined(INET) || defined(INET6)
	case SIOCSVH:
	case SIOCGVH:
		if (carp_ioctl_p == NULL)
			error = EPROTONOSUPPORT;
		else
			error = (*carp_ioctl_p)(ifr, cmd, td);
		goto out_noref;
#endif
	}

	/* Everything below operates on a referenced ifnet. */
	ifp = ifunit_ref(ifr->ifr_name);
	if (ifp == NULL) {
		error = ENXIO;
		goto out_noref;
	}

	error = ifhwioctl(cmd, ifp, data, td);
	if (error != ENOIOCTL)
		goto out_ref;

	/* Snapshot taken to detect an IFF_UP change made by the calls below. */
	oif_flags = ifp->if_flags;
	if (so->so_proto == NULL) {
		error = EOPNOTSUPP;
		goto out_ref;
	}

	/*
	 * Pass the request on to the socket control method, and if the
	 * latter returns EOPNOTSUPP, directly to the interface.
	 *
	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
	 * trust SIOCSIFADDR et al to come from an already privileged
	 * layer, and do not perform any credentials checks or input
	 * validation.
	 */
	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
	    ifp, td));
	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
		error = (*ifp->if_ioctl)(ifp, cmd, data);

	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
		if (ifp->if_flags & IFF_UP)
			in6_if_up(ifp);
#endif
	}

out_ref:
	if_rele(ifp);
out_noref:
#ifdef COMPAT_FREEBSD32
	if (ifmrp != NULL) {
		KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA),
		    ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx",
		     cmd));
		/* Copy the native result back into the caller's 32-bit buffer. */
		data = saved_data;
		ifmr_update(ifmrp, data);
	}
#endif
	CURVNET_RESTORE();
	return (error);
}

/*
 * The code common to handling reference counted flags,
 * e.g., in ifpromisc() and if_allmulti().
* The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. 
* Results are undefined if the "off" and "on" requests are not matched.
 *
 * Returns whatever if_setflag() returns.
 */
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
	int error;
	/* Captured before the call so a change in IFF_PROMISC can be detected. */
	int oldflags = ifp->if_flags;

	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
			   &ifp->if_pcount, pswitch);
	/* If promiscuous mode status has changed, log a message */
	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
            log_promisc_mode_change)
		if_printf(ifp, "promiscuous mode %s\n",
		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
	return (error);
}

/*
 * Return interface configuration
 * of system.  List may be used
 * in later ioctl's (above) to get
 * other information.
 *
 * 'data' is a struct ifconf.  One ifreq-style record is produced per
 * address visible to the calling credential, or one record with a zeroed
 * address for an interface with no such address.  On return ifc_len holds
 * the number of bytes copied out.  'cmd' is not used.
 */
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
	struct ifconf *ifc = (struct ifconf *)data;
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct ifreq ifr;
	struct sbuf *sb;
	int error, full = 0, valid_len, max_len;

	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
	max_len = MAXPHYS - 1;

	/* Prevent hostile input from being able to crash the system */
	if (ifc->ifc_len <= 0)
		return (EINVAL);

again:
	/*
	 * 'full' is set once the working buffer is as large as the user's
	 * buffer; from then on no retry is attempted.
	 */
	if (ifc->ifc_len <= max_len) {
		max_len = ifc->ifc_len;
		full = 1;
	}
	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
	/*
	 * From here on max_len counts the bytes we tried to append and
	 * valid_len the sbuf length last seen while it had no error.
	 */
	max_len = 0;
	valid_len = 0;

	IFNET_RLOCK();
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		int addrs;

		/*
		 * Zero the ifr to make sure we don't disclose the contents
		 * of the stack.
		 */
		memset(&ifr, 0, sizeof(ifr));

		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
		    >= sizeof(ifr.ifr_name)) {
			sbuf_delete(sb);
			IFNET_RUNLOCK();
			return (ENAMETOOLONG);
		}

		addrs = 0;
		IF_ADDR_RLOCK(ifp);
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			struct sockaddr *sa = ifa->ifa_addr;

			/* Skip addresses for which prison_if() returns non-zero. */
			if (prison_if(curthread->td_ucred, sa) != 0)
				continue;
			addrs++;
			if (sa->sa_len <= sizeof(*sa)) {
				/* Address fits inside the ifreq: emit a whole ifreq. */
				if (sa->sa_len < sizeof(*sa)) {
					memset(&ifr.ifr_ifru.ifru_addr, 0,
					    sizeof(ifr.ifr_ifru.ifru_addr));
					memcpy(&ifr.ifr_ifru.ifru_addr, sa,
					    sa->sa_len);
				} else
					ifr.ifr_ifru.ifru_addr = *sa;
				sbuf_bcat(sb, &ifr, sizeof(ifr));
				max_len += sizeof(ifr);
			} else {
				/*
				 * Longer address: emit the ifreq up to
				 * ifr_addr, followed by the full sockaddr.
				 */
				sbuf_bcat(sb, &ifr,
				    offsetof(struct ifreq, ifr_addr));
				max_len += offsetof(struct ifreq, ifr_addr);
				sbuf_bcat(sb, sa, sa->sa_len);
				max_len += sa->sa_len;
			}

			if (sbuf_error(sb) == 0)
				valid_len = sbuf_len(sb);
		}
		IF_ADDR_RUNLOCK(ifp);
		if (addrs == 0) {
			/* No reportable address: emit the name with a zeroed address. */
			sbuf_bcat(sb, &ifr, sizeof(ifr));
			max_len += sizeof(ifr);

			if (sbuf_error(sb) == 0)
				valid_len = sbuf_len(sb);
		}
	}
	IFNET_RUNLOCK();

	/*
	 * If we didn't allocate enough space (uncommon), try again.  If
	 * we have already allocated as much space as we are allowed,
	 * return what we've got.
	 */
	if (valid_len != max_len && !full) {
		sbuf_delete(sb);
		goto again;
	}

	ifc->ifc_len = valid_len;
	sbuf_finish(sb);
	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
	sbuf_delete(sb);
	return (error);
}

/*
 * Just like ifpromisc(), but for all-multicast-reception mode.
 * No permanent-mode flag is passed to if_setflag() (pflag is 0).
 */
int
if_allmulti(struct ifnet *ifp, int onswitch)
{

	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
}

/*
 * Look up 'sa' in ifp's multicast address list and return the matching
 * entry, or NULL if the list holds no match.  AF_LINK addresses are
 * compared with sa_dl_equal(), all others with sa_equal().  The interface
 * address lock must be held (asserted).
 */
struct ifmultiaddr *
if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
{
	struct ifmultiaddr *ifma;

	IF_ADDR_LOCK_ASSERT(ifp);

	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (sa->sa_family == AF_LINK) {
			if (sa_dl_equal(ifma->ifma_addr, sa))
				break;
		} else {
			if (sa_equal(ifma->ifma_addr, sa))
				break;
		}
	}

	return ifma;
}

/*
 * Allocate a new ifmultiaddr and initialize based on passed arguments.
We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
*/
#ifdef MCAST_VERBOSE
extern void kdb_backtrace(void);
#endif

/*
 * Release the memory of an ifmultiaddr whose reference count has reached
 * zero: the link-layer address copy (if any), the address copy and the
 * structure itself.
 */
static void
if_freemulti_internal(struct ifmultiaddr *ifma)
{

	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
	    ifma->ifma_refcount));

	if (ifma->ifma_lladdr != NULL)
		free(ifma->ifma_lladdr, M_IFMADDR);
#ifdef MCAST_VERBOSE
	kdb_backtrace();
	printf("%s freeing ifma: %p\n", __func__, ifma);
#endif
	free(ifma->ifma_addr, M_IFMADDR);
	free(ifma, M_IFMADDR);
}

/*
 * epoch_call() callback: recover the ifmultiaddr from its embedded epoch
 * context and free it.
 */
static void
if_destroymulti(epoch_context_t ctx)
{
	struct ifmultiaddr *ifma;

	ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx);
	if_freemulti_internal(ifma);
}

/*
 * Hand an unreferenced ifmultiaddr to epoch_call() on net_epoch_preempt;
 * the actual free happens in if_destroymulti().
 */
void
if_freemulti(struct ifmultiaddr *ifma)
{
	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d",
	    ifma->ifma_refcount));

	epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti);
}

/*
 * Register an additional multicast address with a network interface.
 *
 * - If the address is already present, bump the reference count on the
 *   address and return.
 * - If the address is not link-layer, look up a link layer address.
 * - Allocate address structures for one or both addresses, and attach to the
 *   multicast address list on the interface.  If automatically adding a link
 *   layer address, the protocol address will own a reference to the link
 *   layer address, to be freed when it is freed.
 * - Notify the network device driver of an addition to the multicast address
 *   list.
 *
 * 'sa' points to caller-owned memory with the desired multicast address.
 *
 * 'retifma' will be used to return a pointer to the resulting multicast
 * address reference, if desired.
 *
 * Returns 0 on success, the error from if_resolvemulti() if resolving
 * fails, or ENOMEM if an allocation fails.
 */
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
    struct ifmultiaddr **retifma)
{
	struct ifmultiaddr *ifma, *ll_ifma;
	struct sockaddr *llsa;
	struct sockaddr_dl sdl;
	int error;

#ifdef INET
	IN_MULTI_LIST_UNLOCK_ASSERT();
#endif
#ifdef INET6
	IN6_MULTI_LIST_UNLOCK_ASSERT();
#endif
	/*
	 * If the address is already present, return a new reference to it;
	 * otherwise, allocate storage and set up a new address.
	 */
	IF_ADDR_WLOCK(ifp);
	ifma = if_findmulti(ifp, sa);
	if (ifma != NULL) {
		ifma->ifma_refcount++;
		if (retifma != NULL)
			*retifma = ifma;
		IF_ADDR_WUNLOCK(ifp);
		return (0);
	}

	/*
	 * The address isn't already present; resolve the protocol address
	 * into a link layer address, and then look that up, bump its
	 * refcount or allocate an ifma for that also.
	 * Most link layer resolving functions returns address data which
	 * fits inside default sockaddr_dl structure. However callback
	 * can allocate another sockaddr structure, in that case we need to
	 * free it later.
	 */
	llsa = NULL;
	ll_ifma = NULL;
	if (ifp->if_resolvemulti != NULL) {
		/* Provide called function with buffer size information */
		sdl.sdl_len = sizeof(sdl);
		llsa = (struct sockaddr *)&sdl;
		error = ifp->if_resolvemulti(ifp, &llsa, sa);
		if (error)
			goto unlock_out;
	}

	/*
	 * Allocate the new address.  Don't hook it up yet, as we may also
	 * need to allocate a link layer multicast address.
	 */
	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
	if (ifma == NULL) {
		error = ENOMEM;
		goto free_llsa_out;
	}

	/*
	 * If a link layer address is found, we'll need to see if it's
	 * already present in the address list, or allocate is as well.
	 * When this block finishes, the link layer address will be on the
	 * list.
	 */
	if (llsa != NULL) {
		ll_ifma = if_findmulti(ifp, llsa);
		if (ll_ifma == NULL) {
			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
			if (ll_ifma == NULL) {
				/* Drop the initial reference so the free asserts hold. */
				--ifma->ifma_refcount;
				if_freemulti(ifma);
				error = ENOMEM;
				goto free_llsa_out;
			}
			CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
			    ifma_link);
		} else
			ll_ifma->ifma_refcount++;
		ifma->ifma_llifma = ll_ifma;
	}

	/*
	 * We now have a new multicast address, ifma, and possibly a new or
	 * referenced link layer address.  Add the primary address to the
	 * ifnet address list.
	 */
	CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);

	if (retifma != NULL)
		*retifma = ifma;

	/*
	 * Must generate the message while holding the lock so that 'ifma'
	 * pointer is still valid.
	 */
	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
	IF_ADDR_WUNLOCK(ifp);

	/*
	 * We are certain we have added something, so call down to the
	 * interface to let them know about it.
	 */
	if (ifp->if_ioctl != NULL) {
		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
	}

	/* Free llsa only if the resolver replaced our on-stack sdl. */
	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

	return (0);

free_llsa_out:
	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

unlock_out:
	IF_ADDR_WUNLOCK(ifp);
	return (error);
}

/*
 * Delete a multicast group membership by network-layer group address.
 *
 * Returns ENOENT if the entry could not be found. If ifp no longer
 * exists, results are undefined. This entry point should only be used
 * from subsystems which do appropriate locking to hold ifp for the
 * duration of the call.
 * Network-layer protocol domains must use if_delmulti_ifma().
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
	struct ifmultiaddr *ifma;
	int lastref;
#ifdef INVARIANTS
	struct ifnet *oifp;

	/* Debug check: ifp must still be on the global interface list. */
	IFNET_RLOCK_NOSLEEP();
	CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
		if (ifp == oifp)
			break;
	if (ifp != oifp)
		ifp = NULL;
	IFNET_RUNLOCK_NOSLEEP();

	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
#endif
	if (ifp == NULL)
		return (ENOENT);

	IF_ADDR_WLOCK(ifp);
	lastref = 0;
	ifma = if_findmulti(ifp, sa);
	if (ifma != NULL)
		lastref = if_delmulti_locked(ifp, ifma, 0);
	IF_ADDR_WUNLOCK(ifp);

	if (ifma == NULL)
		return (ENOENT);

	/* Tell the driver only when the last reference went away. */
	if (lastref && ifp->if_ioctl != NULL) {
		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
	}

	return (0);
}

/*
 * Delete all multicast group membership for an interface.
 * Should be used to quickly flush all multicast filters.
*/ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. 
*/
/*
 * Arguments as used below:
 *   ifp       - only consulted for the consistency assertion; it is then
 *               replaced by ifma->ifma_ifp, which may be NULL.
 *   ifma      - the membership whose reference count is dropped.
 *   detaching - nonzero selects the "ifnet is going away" handling.
 */
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
	struct ifmultiaddr *ll_ifma;

	/* When both pointers are known they must agree, and the lock be held. */
	if (ifp != NULL && ifma->ifma_ifp != NULL) {
		KASSERT(ifma->ifma_ifp == ifp,
		    ("%s: inconsistent ifp %p", __func__, ifp));
		IF_ADDR_WLOCK_ASSERT(ifp);
	}

	/* From here on, work with the ifnet recorded in the membership. */
	ifp = ifma->ifma_ifp;
	MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");

	/*
	 * If the ifnet is detaching, null out references to ifnet,
	 * so that upper protocol layers will notice, and not attempt
	 * to obtain locks for an ifnet which no longer exists. The
	 * routing socket announcement must happen before the ifnet
	 * instance is detached from the system.
	 */
	if (detaching) {
#ifdef DIAGNOSTIC
		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
		/*
		 * ifp may already be nulled out if we are being reentered
		 * to delete the ll_ifma.
		 */
		if (ifp != NULL) {
			rt_newmaddrmsg(RTM_DELMADDR, ifma);
			ifma->ifma_ifp = NULL;
		}
	}

	/* Other references remain: nothing is unlinked or freed. */
	if (--ifma->ifma_refcount > 0)
		return 0;

	/* Unlink from the interface list only in the non-detaching case. */
	if (ifp != NULL && detaching == 0)
		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);

	/*
	 * If this ifma is a network-layer ifma, a link-layer ifma may
	 * have been associated with it. Release it first if so.
	 */
	ll_ifma = ifma->ifma_llifma;
	if (ll_ifma != NULL) {
		KASSERT(ifma->ifma_lladdr != NULL,
		    ("%s: llifma w/o lladdr", __func__));
		if (detaching)
			ll_ifma->ifma_ifp = NULL;	/* XXX */
		/* Drop the link-layer reference; unlink and free on the last one. */
		if (--ll_ifma->ifma_refcount == 0) {
			if (ifp != NULL) {
				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
				    ifma_link);
			}
			if_freemulti(ll_ifma);
		}
	}
#ifdef INVARIANTS
	/* With INVARIANTS: ifma must no longer be reachable from the list. */
	if (ifp) {
		struct ifmultiaddr *ifmatmp;

		CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
			MPASS(ifma != ifmatmp);
	}
#endif
	if_freemulti(ifma);
	/*
	 * The last reference to this instance of struct ifmultiaddr
	 * was released; the hardware should be notified of this change.
	 */
	return 1;
}

/*
 * Set the link layer address on an interface.
*
 * At this time we only support certain types of interfaces,
 * and we don't allow the length of the address to change.
 *
 * Set noinline to be dtrace-friendly
 *
 * Returns 0 on success, EINVAL when the interface has no link-level
 * address record or len differs from the stored address length, and
 * ENODEV when the interface type is not one of those handled below.
 */
__noinline int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
	struct sockaddr_dl *sdl;
	struct ifaddr *ifa;
	struct ifreq ifr;
	int rc;

	rc = 0;
	/* if_addr and its sockaddr_dl are examined inside the net epoch. */
	NET_EPOCH_ENTER();
	ifa = ifp->if_addr;
	if (ifa == NULL) {
		rc = EINVAL;
		goto out;
	}

	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
	if (sdl == NULL) {
		rc = EINVAL;
		goto out;
	}
	if (len != sdl->sdl_alen) {	/* don't allow length to change */
		rc = EINVAL;
		goto out;
	}
	/* Only these interface types get the new address copied in. */
	switch (ifp->if_type) {
	case IFT_ETHER:
	case IFT_XETHER:
	case IFT_L2VLAN:
	case IFT_BRIDGE:
	case IFT_IEEE8023ADLAG:
		bcopy(lladdr, LLADDR(sdl), len);
		break;
	default:
		rc = ENODEV;
		goto out;
	}

	/*
	 * If the interface is already up, we need
	 * to re-init it in order to reprogram its
	 * address filter.
	 */
	/* Success path leaves the epoch here; error paths leave it at "out". */
	NET_EPOCH_EXIT();
	if ((ifp->if_flags & IFF_UP) != 0) {
		if (ifp->if_ioctl) {
			/* First pass: present the flags with IFF_UP cleared. */
			ifp->if_flags &= ~IFF_UP;
			ifr.ifr_flags = ifp->if_flags & 0xffff;
			ifr.ifr_flagshigh = ifp->if_flags >> 16;
			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
			/* Second pass: set IFF_UP again and present the flags. */
			ifp->if_flags |= IFF_UP;
			ifr.ifr_flags = ifp->if_flags & 0xffff;
			ifr.ifr_flagshigh = ifp->if_flags >> 16;
			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
		}
	}
	/* Notify iflladdr_event listeners of the change. */
	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
	return (0);
 out:
	NET_EPOCH_EXIT();
	return (rc);
}

/*
 * Compat function for handling basic encapsulation requests.
 * Not converted stacks (FDDI, IB, ..) supports traditional
 * output model: ARP (and other similar L2 protocols) are handled
 * inside output routine, arpresolve/nd6_resolve() returns MAC
 * address instead of full prepend.
 *
 * This function creates calculated header==MAC for IPv4/IPv6 and
 * returns EAFNOSUPPORT (which is then handled in ARP code) for other
 * address families.
*/ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) 
{ char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, 
int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. 
*/ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int 
if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); 
return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet 
*)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_gif.h =================================================================== --- head/sys/net/if_gif.h (revision 335923) +++ head/sys/net/if_gif.h (revision 335924) @@ -1,130 +1,130 @@ /* $FreeBSD$ */ /* $KAME: if_gif.h,v 1.17 2000/09/11 11:36:41 sumikawa Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_IF_GIF_H_ #define _NET_IF_GIF_H_ #ifdef _KERNEL struct ip; struct ip6_hdr; extern void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); extern void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); extern int (*ng_gif_output_p)(struct ifnet *ifp, struct mbuf **mp); extern void (*ng_gif_attach_p)(struct ifnet *ifp); extern void (*ng_gif_detach_p)(struct ifnet *ifp); struct gif_softc { struct ifnet *gif_ifp; int gif_family; int gif_flags; u_int gif_fibnum; u_int gif_options; void *gif_netgraph; /* netgraph node info */ union { void *hdr; struct ip *iphdr; struct ip6_hdr *ip6hdr; } gif_uhdr; CK_LIST_ENTRY(gif_softc) chain; }; CK_LIST_HEAD(gif_list, gif_softc); MALLOC_DECLARE(M_GIF); #ifndef GIF_HASH_SIZE #define GIF_HASH_SIZE (1 << 4) #endif #define GIF2IFP(sc) ((sc)->gif_ifp) #define gif_iphdr gif_uhdr.iphdr #define gif_hdr gif_uhdr.hdr #define gif_ip6hdr gif_uhdr.ip6hdr #define GIF_MTU (1280) /* Default MTU */ #define GIF_MTU_MIN (1280) /* Minimum MTU */ #define GIF_MTU_MAX (8192) /* Maximum MTU */ struct etherip_header { #if BYTE_ORDER == LITTLE_ENDIAN u_int eip_resvl:4, /* reserved */ eip_ver:4; /* version */ #endif #if BYTE_ORDER == BIG_ENDIAN u_int eip_ver:4, /* version */ eip_resvl:4; /* reserved */ #endif u_int8_t eip_resvh; /* reserved */ } __packed; #define ETHERIP_VERSION 0x3 /* mbuf adjust factor to force 32-bit alignment of IP header */ #define ETHERIP_ALIGN 2 -#define GIF_RLOCK() 
epoch_enter_preempt(net_epoch_preempt) -#define GIF_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define GIF_RLOCK() struct epoch_tracker gif_et; epoch_enter_preempt(net_epoch_preempt, &gif_et) +#define GIF_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &gif_et) #define GIF_WAIT() epoch_wait_preempt(net_epoch_preempt) /* Prototypes */ struct gif_list *gif_hashinit(void); void gif_hashdestroy(struct gif_list *); void gif_input(struct mbuf *, struct ifnet *, int, uint8_t); int gif_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void in_gif_init(void); void in_gif_uninit(void); int in_gif_output(struct ifnet *, struct mbuf *, int, uint8_t); int in_gif_ioctl(struct gif_softc *, u_long, caddr_t); int in_gif_setopts(struct gif_softc *, u_int); void in6_gif_init(void); void in6_gif_uninit(void); int in6_gif_output(struct ifnet *, struct mbuf *, int, uint8_t); int in6_gif_ioctl(struct gif_softc *, u_long, caddr_t); int in6_gif_setopts(struct gif_softc *, u_int); #endif /* _KERNEL */ #define GIFGOPTS _IOWR('i', 150, struct ifreq) #define GIFSOPTS _IOW('i', 151, struct ifreq) #define GIF_IGNORE_SOURCE 0x0002 #define GIF_OPTMASK (GIF_IGNORE_SOURCE) #endif /* _NET_IF_GIF_H_ */ Index: head/sys/net/if_gre.h =================================================================== --- head/sys/net/if_gre.h (revision 335923) +++ head/sys/net/if_gre.h (revision 335924) @@ -1,145 +1,145 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: if_gre.h,v 1.13 2003/11/10 08:51:52 wiz Exp $ * $FreeBSD$ */ #ifndef _NET_IF_GRE_H_ #define _NET_IF_GRE_H_ #ifdef _KERNEL /* GRE header according to RFC 2784 and RFC 2890 */ struct grehdr { uint16_t gre_flags; /* GRE flags */ #define GRE_FLAGS_CP 0x8000 /* checksum present */ #define GRE_FLAGS_KP 0x2000 /* key present */ #define GRE_FLAGS_SP 0x1000 /* sequence present */ #define GRE_FLAGS_MASK (GRE_FLAGS_CP|GRE_FLAGS_KP|GRE_FLAGS_SP) uint16_t gre_proto; /* protocol type */ uint32_t gre_opts[0]; /* optional fields */ } __packed; #ifdef INET struct greip { struct ip gi_ip; struct grehdr gi_gre; } __packed; #endif #ifdef INET6 struct greip6 { struct ip6_hdr gi6_ip6; struct grehdr gi6_gre; } __packed; #endif struct gre_softc { struct ifnet *gre_ifp; int gre_family; /* AF of delivery header */ uint32_t gre_iseq; uint32_t gre_oseq; uint32_t gre_key; uint32_t gre_options; u_int gre_fibnum; u_int gre_hlen; /* header size */ union { void *hdr; #ifdef INET struct greip *gihdr; #endif #ifdef 
INET6 struct greip6 *gi6hdr; #endif } gre_uhdr; CK_LIST_ENTRY(gre_softc) chain; }; CK_LIST_HEAD(gre_list, gre_softc); MALLOC_DECLARE(M_GRE); #ifndef GRE_HASH_SIZE #define GRE_HASH_SIZE (1 << 4) #endif #define GRE2IFP(sc) ((sc)->gre_ifp) -#define GRE_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define GRE_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define GRE_RLOCK() struct epoch_tracker gre_et; epoch_enter_preempt(net_epoch_preempt, &gre_et) +#define GRE_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &gre_et) #define GRE_WAIT() epoch_wait_preempt(net_epoch_preempt) #define gre_hdr gre_uhdr.hdr #define gre_gihdr gre_uhdr.gihdr #define gre_gi6hdr gre_uhdr.gi6hdr #define gre_oip gre_gihdr->gi_ip #define gre_oip6 gre_gi6hdr->gi6_ip6 struct gre_list *gre_hashinit(void); void gre_hashdestroy(struct gre_list *); int gre_input(struct mbuf *, int, int, void *); void gre_updatehdr(struct gre_softc *, struct grehdr *); void in_gre_init(void); void in_gre_uninit(void); void in_gre_setopts(struct gre_softc *, u_long, uint32_t); int in_gre_ioctl(struct gre_softc *, u_long, caddr_t); int in_gre_output(struct mbuf *, int, int); void in6_gre_init(void); void in6_gre_uninit(void); void in6_gre_setopts(struct gre_softc *, u_long, uint32_t); int in6_gre_ioctl(struct gre_softc *, u_long, caddr_t); int in6_gre_output(struct mbuf *, int, int); /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated * into GRE. 
*/ #define ETHERTYPE_WCCP 0x883E #endif /* _KERNEL */ #define GRESADDRS _IOW('i', 101, struct ifreq) #define GRESADDRD _IOW('i', 102, struct ifreq) #define GREGADDRS _IOWR('i', 103, struct ifreq) #define GREGADDRD _IOWR('i', 104, struct ifreq) #define GRESPROTO _IOW('i' , 105, struct ifreq) #define GREGPROTO _IOWR('i', 106, struct ifreq) #define GREGKEY _IOWR('i', 107, struct ifreq) #define GRESKEY _IOW('i', 108, struct ifreq) #define GREGOPTS _IOWR('i', 109, struct ifreq) #define GRESOPTS _IOW('i', 110, struct ifreq) #define GRE_ENABLE_CSUM 0x0001 #define GRE_ENABLE_SEQ 0x0002 #define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) #endif /* _NET_IF_GRE_H_ */ Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 335923) +++ head/sys/net/if_lagg.c (revision 335924) @@ -1,2193 +1,2193 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include -#define LAGG_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) -#define LAGG_RLOCK_ASSERT() MPASS(in_epoch()) -#define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch()) +#define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) +#define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) +#define LAGG_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) +#define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch(net_epoch_preempt)) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. 
*/ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) static VNET_DEFINE(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #ifdef RATELIMIT static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static int 
lagg_setcaps(struct lagg_port *, int cap); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void 
(*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ static VNET_DEFINE(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ static VNET_DEFINE(int, 
def_use_flowid) = 0;
#define	V_def_use_flowid	VNET(def_use_flowid)
SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
    &VNET_NAME(def_use_flowid), 0,
    "Default setting for using flow id for load sharing");

/* Default value for flowid shift */
static VNET_DEFINE(int, def_flowid_shift) = 16;
#define	V_def_flowid_shift	VNET(def_flowid_shift)
SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
    &VNET_NAME(def_flowid_shift), 0,
    "Default setting for flowid shift for load sharing");

/*
 * Per-vnet setup: initialise the lagg list and its lock, then register
 * the interface cloner that creates and destroys lagg interfaces.
 */
static void
vnet_lagg_init(const void *unused __unused)
{

	LAGG_LIST_LOCK_INIT();
	SLIST_INIT(&V_lagg_list);
	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
	    lagg_clone_destroy, 0);
}
VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    vnet_lagg_init, NULL);

/* Per-vnet teardown: detach the cloner, then destroy the list lock. */
static void
vnet_lagg_uninit(const void *unused __unused)
{

	if_clone_detach(V_lagg_cloner);
	LAGG_LIST_LOCK_DESTROY();
}
VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
    vnet_lagg_uninit, NULL);

/*
 * Module event handler.  On load, install the lagg input and link-state
 * hooks and register for interface departure events; on unload, undo
 * that.  Any other event type returns EOPNOTSUPP.
 */
static int
lagg_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		lagg_input_p = lagg_input;
		lagg_linkstate_p = lagg_port_state;
		lagg_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, lagg_port_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    lagg_detach_cookie);
		lagg_input_p = NULL;
		lagg_linkstate_p = NULL;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t lagg_mod = {
	"if_lagg",
	lagg_modevent,
	0
};

DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_lagg, 1);

/*
 * Attach protocol `pr' to the lagg.  The softc must currently have no
 * protocol (asserted) and the caller must hold the exclusive lock.
 * sc_proto is set only after the protocol's attach hook has run.
 */
static void
lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
{

	LAGG_XLOCK_ASSERT(sc);
	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
	    __func__, sc));

	if (sc->sc_ifflags & IFF_DEBUG)
		if_printf(sc->sc_ifp, "using proto %u\n", pr);

	if (lagg_protos[pr].pr_attach != NULL)
		lagg_protos[pr].pr_attach(sc);
	sc->sc_proto = pr;
}

static void
lagg_proto_detach(struct lagg_softc *sc)
{
	lagg_proto pr;

	LAGG_XLOCK_ASSERT(sc);
	/* Switch to PROTO_NONE before running the old protocol's detach hook. */
	pr = sc->sc_proto;
	sc->sc_proto = LAGG_PROTO_NONE;

	if (lagg_protos[pr].pr_detach != NULL)
		lagg_protos[pr].pr_detach(sc);
}

/*
 * The wrappers below dispatch through lagg_protos[] for the softc's
 * current protocol.  pr_start and pr_input are called unconditionally;
 * every other hook is optional and skipped when NULL.
 */
static int
lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
{

	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
}

static struct mbuf *
lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{

	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
}

/* Returns 0 when the protocol has no addport hook. */
static int
lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
{

	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
		return (0);
	else
		return (lagg_protos[sc->sc_proto].pr_addport(lp));
}

static void
lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
{

	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
		lagg_protos[sc->sc_proto].pr_delport(lp);
}

static void
lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
{

	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
		lagg_protos[sc->sc_proto].pr_linkstate(lp);
}

static void
lagg_proto_init(struct lagg_softc *sc)
{

	if (lagg_protos[sc->sc_proto].pr_init != NULL)
		lagg_protos[sc->sc_proto].pr_init(sc);
}

static void
lagg_proto_stop(struct lagg_softc *sc)
{

	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
		lagg_protos[sc->sc_proto].pr_stop(sc);
}

static void
lagg_proto_lladdr(struct lagg_softc *sc)
{

	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
		lagg_protos[sc->sc_proto].pr_lladdr(sc);
}

static void
lagg_proto_request(struct lagg_softc *sc, void *v)
{

	if (lagg_protos[sc->sc_proto].pr_request != NULL)
		lagg_protos[sc->sc_proto].pr_request(sc, v);
}

static void
lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
{

	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
}

/*
 * This routine is run via a vlan
 * config EVENT
 */
static void
lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc *sc = ifp->if_softc;
	struct lagg_port
*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	/* Re-issue the vlan_config event on every member port. */
	LAGG_RLOCK();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
	LAGG_RUNLOCK();
}

/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
static void
lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc *sc = ifp->if_softc;
	struct lagg_port *lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	/* Re-issue the vlan_unconfig event on every member port. */
	LAGG_RLOCK();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
	LAGG_RUNLOCK();
}

/*
 * Cloner create callback: allocate and initialise a new lagg softc and
 * its ifnet, attach the default protocol, attach the ifnet as an
 * Ethernet device, register the vlan event handlers and link the softc
 * onto the per-vnet lagg list.  Returns 0, or ENOSPC if if_alloc()
 * fails.
 */
static int
lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct lagg_softc *sc;
	struct ifnet *ifp;
	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}
	LAGG_SX_INIT(sc);

	LAGG_XLOCK(sc);
	if (V_def_use_flowid)
		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
	sc->flowid_shift = V_def_flowid_shift;

	/* Hash all layers by default */
	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;

	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);

	CK_SLIST_INIT(&sc->sc_ports);

	/* Initialise pseudo media types */
	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
	    lagg_media_status);
	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);

	if_initname(ifp, laggname, unit);
	ifp->if_softc = sc;
	ifp->if_transmit = lagg_transmit;
	ifp->if_qflush = lagg_qflush;
	ifp->if_init = lagg_init;
	ifp->if_ioctl = lagg_ioctl;
	ifp->if_get_counter = lagg_get_counter;
	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
#ifdef RATELIMIT
	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT;
#else
	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
#endif

	/*
	 * Attach as an ordinary ethernet device, children will be attached
	 * as special device
IFT_IEEE8023ADLAG.
	 */
	ether_ifattach(ifp, eaddr);

	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);

	/* Insert into the global list of laggs */
	LAGG_LIST_LOCK();
	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
	LAGG_LIST_UNLOCK();
	LAGG_XUNLOCK(sc);

	return (0);
}

/*
 * Cloner destroy callback: stop the interface, deregister the vlan
 * handlers, destroy every member port, detach the protocol, and then
 * tear down and free the ifnet and the softc.
 */
static void
lagg_clone_destroy(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	LAGG_XLOCK(sc);
	/* lagg_port_destroy() tests this flag before updating the lagg lladdr. */
	sc->sc_destroying = 1;
	lagg_stop(sc);
	ifp->if_flags &= ~IFF_UP;

	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);

	/* Shutdown and remove lagg ports */
	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
		lagg_port_destroy(lp, 1);

	/* Unhook the aggregation protocol */
	lagg_proto_detach(sc);
	LAGG_XUNLOCK(sc);

	ifmedia_removeall(&sc->sc_media);
	ether_ifdetach(ifp);
	if_free(ifp);

	LAGG_LIST_LOCK();
	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
	LAGG_LIST_UNLOCK();

	LAGG_SX_DESTROY(sc);
	free(sc, M_DEVBUF);
}

/*
 * Recompute the lagg interface's capabilities, enabled capabilities,
 * hwassist bits and TSO limits from its member ports.  Caller must hold
 * the exclusive lock.
 */
static void
lagg_capabilities(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int cap, ena, pena;
	uint64_t hwa;
	struct ifnet_hw_tsomax hw_tsomax;

	LAGG_XLOCK_ASSERT(sc);

	/* Get common enabled capabilities for the lagg ports */
	ena = ~0;
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		ena &= lp->lp_ifp->if_capenable;
	/* ~0 is left untouched only when the port list is empty. */
	ena = (ena == ~0 ? 0 : ena);

	/*
	 * Apply common enabled capabilities back to the lagg ports.
	 * May require several iterations if they are dependent.
*/
	do {
		pena = ena;
		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setcaps(lp, ena);
			ena &= lp->lp_ifp->if_capenable;
		}
	} while (pena != ena);	/* repeat until the enabled set stops changing */

	/* Get other capabilities from the lagg ports */
	cap = ~0;
	hwa = ~(uint64_t)0;
	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		cap &= lp->lp_ifp->if_capabilities;
		hwa &= lp->lp_ifp->if_hwassist;
		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
	}
	cap = (cap == ~0 ? 0 : cap);
	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);

	/* Only touch the lagg ifnet (and if_lastchange) if something changed. */
	if (sc->sc_ifp->if_capabilities != cap ||
	    sc->sc_ifp->if_capenable != ena ||
	    sc->sc_ifp->if_hwassist != hwa ||
	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
		sc->sc_ifp->if_capabilities = cap;
		sc->sc_ifp->if_capenable = ena;
		sc->sc_ifp->if_hwassist = hwa;
		getmicrotime(&sc->sc_ifp->if_lastchange);

		if (sc->sc_ifflags & IFF_DEBUG)
			if_printf(sc->sc_ifp,
			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
	}
}

/*
 * Add interface `ifp' to lagg `sc' as a member port.  Caller must hold
 * the exclusive lock.  Returns 0 on success or an errno value (ENOSPC,
 * EEXIST, EBUSY, EPROTONOSUPPORT, EINVAL, or the error returned by the
 * protocol's addport hook).
 */
static int
lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp, *tlp;
	int error, i;
	uint64_t *pval;

	LAGG_XLOCK_ASSERT(sc);

	/* Limit the maximal number of lagg ports */
	if (sc->sc_count >= LAGG_MAX_PORTS)
		return (ENOSPC);

	/* Check if port has already been associated to a lagg */
	if (ifp->if_lagg != NULL) {
		/* Port is already in the current lagg?
*/
		lp = (struct lagg_port *)ifp->if_lagg;
		if (lp->lp_softc == sc)
			return (EEXIST);
		return (EBUSY);
	}

	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
		return (EPROTONOSUPPORT);

	/* Allow the first Ethernet member to define the MTU */
	if (CK_SLIST_EMPTY(&sc->sc_ports))
		sc->sc_ifp->if_mtu = ifp->if_mtu;
	else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
		if_printf(sc->sc_ifp, "invalid MTU for %s\n",
		    ifp->if_xname);
		return (EINVAL);
	}

	lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_WAITOK|M_ZERO);
	lp->lp_softc = sc;

	/* Check if port is a stacked lagg */
	LAGG_LIST_LOCK();
	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
		if (ifp == sc_ptr->sc_ifp) {
			LAGG_LIST_UNLOCK();
			free(lp, M_DEVBUF);
			return (EINVAL);
			/* XXX disable stacking for the moment, its untested */
#ifdef LAGG_PORT_STACKING
			lp->lp_flags |= LAGG_PORT_STACK;
			if (lagg_port_checkstacking(sc_ptr) >=
			    LAGG_MAX_STACKING) {
				LAGG_LIST_UNLOCK();
				free(lp, M_DEVBUF);
				return (E2BIG);
			}
#endif
		}
	}
	LAGG_LIST_UNLOCK();

	if_ref(ifp);
	lp->lp_ifp = ifp;

	/* Save the port's own lladdr and capabilities for later restore. */
	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);
	lp->lp_ifcapenable = ifp->if_capenable;
	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
		/* First port: the lagg takes over this port's lladdr. */
		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
		lagg_proto_lladdr(sc);
		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
	} else {
		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
	}
	lagg_setflags(lp, 1);

	if (CK_SLIST_EMPTY(&sc->sc_ports))
		sc->sc_primary = lp;

	/* Change the interface type */
	lp->lp_iftype = ifp->if_type;
	ifp->if_type = IFT_IEEE8023ADLAG;
	ifp->if_lagg = lp;
	lp->lp_ioctl = ifp->if_ioctl;
	ifp->if_ioctl = lagg_port_ioctl;
	lp->lp_output = ifp->if_output;
	ifp->if_output = lagg_port_output;

	/* Read port counters */
	pval = lp->port_counters.val;
	for (i = 0; i < IFCOUNTERS; i++, pval++)
		*pval = ifp->if_get_counter(ifp, i);

	/*
	 * Insert into the list of ports.
	 * Keep ports sorted by if_index. It is handy, when configuration
	 * is predictable and `ifconfig laggN create ...` command
	 * will lead to the same result each time.
	 */
	LAGG_RLOCK();
	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
		if (tlp->lp_ifp->if_index < ifp->if_index && (
		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
		    ifp->if_index))
			break;
	}
	LAGG_RUNLOCK();
	/* tlp is NULL when no suitable predecessor was found. */
	if (tlp != NULL)
		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
	else
		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
	sc->sc_count++;

	lagg_setmulti(lp);

	if ((error = lagg_proto_addport(sc, lp)) != 0) {
		/* Remove the port, without calling pr_delport. */
		lagg_port_destroy(lp, 0);
		return (error);
	}

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

#ifdef LAGG_PORT_STACKING
/*
 * Return the stacking depth of `sc': one more than the deepest lagg
 * found among its ports flagged LAGG_PORT_STACK.
 */
static int
lagg_port_checkstacking(struct lagg_softc *sc)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int m = 0;

	LAGG_SXLOCK_ASSERT(sc);
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_flags & LAGG_PORT_STACK) {
			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
			m = MAX(m, lagg_port_checkstacking(sc_ptr));
		}
	}

	return (m + 1);
}
#endif

/*
 * Epoch callback scheduled by lagg_port_destroy(): drop the ifnet
 * reference held by the port and free the port structure.
 */
static void
lagg_port_destroy_cb(epoch_context_t ec)
{
	struct lagg_port *lp;
	struct ifnet *ifp;

	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
	ifp = lp->lp_ifp;

	if_rele(ifp);
	free(lp, M_DEVBUF);
}

/*
 * Remove port `lp' from its lagg and restore the member interface.
 * The protocol's delport hook runs only when `rundelport' is non-zero.
 * Caller must hold the exclusive lock.  Always returns 0.
 */
static int
lagg_port_destroy(struct lagg_port *lp, int rundelport)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct lagg_port *lp_ptr, *lp0;
	struct ifnet *ifp = lp->lp_ifp;
	uint64_t *pval, vdiff;
	int i;

	LAGG_XLOCK_ASSERT(sc);

	if (rundelport)
		lagg_proto_delport(sc, lp);

	if (lp->lp_detaching == 0)
		lagg_clrmulti(lp);

	/* Restore interface */
	ifp->if_type = lp->lp_iftype;
	ifp->if_ioctl = lp->lp_ioctl;
	ifp->if_output = lp->lp_output;
	ifp->if_lagg = NULL;

	/* Update detached port counters */
	pval = lp->port_counters.val;
	for (i = 0; i < IFCOUNTERS; i++, pval++) {
		vdiff = ifp->if_get_counter(ifp, i) - *pval;
sc->detached_counters.val[i] += vdiff;
	}

	/* Finally, remove the port from the lagg */
	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
	sc->sc_count--;

	/* Update the primary interface */
	if (lp == sc->sc_primary) {
		uint8_t lladdr[ETHER_ADDR_LEN];

		/* New primary is the first remaining port; zero lladdr if none. */
		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
			bzero(&lladdr, ETHER_ADDR_LEN);
		else
			bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN);
		sc->sc_primary = lp0;
		if (sc->sc_destroying == 0) {
			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
			lagg_proto_lladdr(sc);
			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
		}

		/*
		 * Update lladdr for each port (new primary needs update
		 * as well, to switch from old lladdr to its 'real' one)
		 */
		CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
			if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN);
	}

	if (lp->lp_ifflags)
		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);

	/* Skip restoring flags/caps/lladdr when the port is marked detaching. */
	if (lp->lp_detaching == 0) {
		lagg_setflags(lp, 0);
		lagg_setcaps(lp, lp->lp_ifcapenable);
		if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN);
	}

	/*
	 * free port and release its ifnet reference after a grace period has
	 * elapsed.
*/
	epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb);
	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

/*
 * ioctl handler installed on member ports by lagg_port_create().
 * Handles SIOCGLAGGPORT, SIOCSIFCAP and SIOCSIFMTU itself and passes
 * every other request to the port's saved ioctl handler.
 */
static int
lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_reqport *rp = (struct lagg_reqport *)data;
	struct lagg_softc *sc;
	struct lagg_port *lp = NULL;
	int error = 0;

	/* Should be checked by the caller */
	if (ifp->if_type != IFT_IEEE8023ADLAG ||
	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
		goto fallback;

	switch (cmd) {
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    ifunit(rp->rp_portname) != ifp) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK();
		/* Re-read if_lagg inside the read section before using it. */
		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK();
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK();
		break;

	case SIOCSIFCAP:
		if (lp->lp_ioctl == NULL) {
			error = EINVAL;
			break;
		}
		error = (*lp->lp_ioctl)(ifp, cmd, data);
		if (error)
			break;

		/* Update lagg interface capabilities */
		LAGG_XLOCK(sc);
		lagg_capabilities(sc);
		LAGG_XUNLOCK(sc);
		VLAN_CAPABILITIES(sc->sc_ifp);
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be changed once joined */
		error = EINVAL;
		break;

	default:
		goto fallback;
	}

	return (error);

fallback:
	if (lp != NULL && lp->lp_ioctl != NULL)
		return ((*lp->lp_ioctl)(ifp, cmd, data));

	return (EINVAL);
}

/*
 * Requests counter @cnt data.
 *
 * Counter value is calculated the following way:
 * 1) for each port, sum difference between current and "initial" measurements.
 * 2) add lagg logical interface counters.
 * 3) add data from detached_counters array.
 *
 * We also do the following things on ports attach/detach:
 * 1) On port attach we store all counters it has into port_counter array.
 * 2) On port detach we add the difference between "initial" and
 *    current counters data to detached_counters array.
*/
static uint64_t
lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
{
	struct lagg_softc *sc;
	struct lagg_port *lp;
	struct ifnet *lpifp;
	uint64_t newval, oldval, vsum;

	/* Revise this when we've got non-generic counters. */
	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));

	sc = (struct lagg_softc *)ifp->if_softc;

	vsum = 0;
	LAGG_RLOCK();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* Saved attached value */
		oldval = lp->port_counters.val[cnt];
		/* current value */
		lpifp = lp->lp_ifp;
		newval = lpifp->if_get_counter(lpifp, cnt);
		/* Accumulate the growth since the port was attached */
		vsum += newval - oldval;
	}
	LAGG_RUNLOCK();

	/*
	 * Add counter data which might be added by upper
	 * layer protocols operating on logical interface.
	 */
	vsum += if_get_counter_default(ifp, cnt);

	/*
	 * Add counter data from detached ports counters
	 */
	vsum += sc->detached_counters.val[cnt];

	return (vsum);
}

/*
 * For direct output to child ports.
 *
 * Only pseudo_AF_HDRCMPLT and AF_UNSPEC destinations are passed to the
 * port's saved output routine; anything else is freed and ENETDOWN is
 * returned.
 */
static int
lagg_port_output(struct ifnet *ifp, struct mbuf *m,
	const struct sockaddr *dst, struct route *ro)
{
	struct lagg_port *lp = ifp->if_lagg;

	switch (dst->sa_family) {
		case pseudo_AF_HDRCMPLT:
		case AF_UNSPEC:
			return ((*lp->lp_output)(ifp, m, dst, ro));
	}

	/* drop any other frames */
	m_freem(m);
	return (ENETDOWN);
}

/*
 * ifnet_departure_event handler (registered in lagg_modevent()):
 * removes a departing interface from the lagg it belongs to, if any.
 */
static void
lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct lagg_port *lp;
	struct lagg_softc *sc;

	if ((lp = ifp->if_lagg) == NULL)
		return;
	/* If the ifnet is just being renamed, don't do anything.
*/
	if (ifp->if_flags & IFF_RENAMING)
		return;

	sc = lp->lp_softc;

	LAGG_XLOCK(sc);
	/* lagg_port_destroy() skips restoring the port when this is set. */
	lp->lp_detaching = 1;
	lagg_port_destroy(lp, 1);
	LAGG_XUNLOCK(sc);
	VLAN_CAPABILITIES(sc->sc_ifp);
}

/*
 * Fill the request structure `rp' with the state of port `lp': lagg
 * and port names, priority, flags, protocol-specific data, and the
 * protocol-dependent MASTER/ACTIVE/COLLECTING/DISTRIBUTING flags.
 */
static void
lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
{
	struct lagg_softc *sc = lp->lp_softc;

	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
	rp->rp_prio = lp->lp_prio;
	rp->rp_flags = lp->lp_flags;
	lagg_proto_portreq(sc, lp, &rp->rp_psc);

	/* Add protocol specific flags */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			if (lp == sc->sc_primary)
				rp->rp_flags |= LAGG_PORT_MASTER;
			if (lp == lagg_link_active(sc, sc->sc_primary))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_BROADCAST:
			if (LAGG_PORTACTIVE(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_LACP:
			/* LACP has a different definition of active */
			if (lacp_isactive(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			if (lacp_iscollecting(lp))
				rp->rp_flags |= LAGG_PORT_COLLECTING;
			if (lacp_isdistributing(lp))
				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
			break;
	}

}

/*
 * if_init handler: mark the lagg running, push the lagg's lladdr to
 * any port whose lladdr differs, and run the protocol's init hook.
 * Does nothing if the interface is already running.
 */
static void
lagg_init(void *xsc)
{
	struct lagg_softc *sc = (struct lagg_softc *)xsc;
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *lp;

	LAGG_XLOCK(sc);
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		LAGG_XUNLOCK(sc);
		return;
	}

	ifp->if_drv_flags |= IFF_DRV_RUNNING;

	/*
	 * Update the port lladdrs if needed.
	 * This might be if_setlladdr() notification
	 * that lladdr has been changed.
*/
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
		    ETHER_ADDR_LEN) != 0)
			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN);
	}

	lagg_proto_init(sc);

	LAGG_XUNLOCK(sc);
}

/*
 * Clear IFF_DRV_RUNNING and run the protocol's stop hook.  Does nothing
 * if the interface is not running.  Caller must hold the exclusive lock.
 */
static void
lagg_stop(struct lagg_softc *sc)
{
	struct ifnet *ifp = sc->sc_ifp;

	LAGG_XLOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	lagg_proto_stop(sc);
}

/*
 * ioctl handler for the lagg interface itself.  `data' is interpreted
 * as a different request structure depending on `cmd'; requests not
 * handled here fall through to ether_ioctl().
 */
static int
lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_reqall *ra = (struct lagg_reqall *)data;
	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
	struct ifreq *ifr = (struct ifreq *)data;
	struct lagg_port *lp;
	struct ifnet *tpif;
	struct thread *td = curthread;
	char *buf, *outbuf;
	int count, buflen, len, error = 0;

	bzero(&rpbuf, sizeof(rpbuf));

	switch (cmd) {
	case SIOCGLAGG:
		LAGG_XLOCK(sc);
		/* Stage the port records in a kernel buffer, copy out after unlock. */
		buflen = sc->sc_count * sizeof(struct lagg_reqport);
		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
		ra->ra_proto = sc->sc_proto;
		lagg_proto_request(sc, &ra->ra_psc);
		count = 0;
		buf = outbuf;
		/* Never produce more than the caller's buffer size allows. */
		len = min(ra->ra_size, buflen);
		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (len < sizeof(rpbuf))
				break;

			lagg_port2req(lp, &rpbuf);
			memcpy(buf, &rpbuf, sizeof(rpbuf));
			count++;
			buf += sizeof(rpbuf);
			len -= sizeof(rpbuf);
		}
		LAGG_XUNLOCK(sc);
		ra->ra_ports = count;
		ra->ra_size = count * sizeof(rpbuf);
		error = copyout(outbuf, ra->ra_port, ra->ra_size);
		free(outbuf, M_TEMP);
		break;
	case SIOCSLAGG:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		/* Reject protocol numbers outside the lagg_protos[] table. */
		if (ra->ra_proto >= LAGG_PROTO_MAX) {
			error = EPROTONOSUPPORT;
			break;
		}

		LAGG_XLOCK(sc);
		lagg_proto_detach(sc);
		LAGG_UNLOCK_ASSERT();
		lagg_proto_attach(sc, ra->ra_proto);
		LAGG_XUNLOCK(sc);
		break;
	case SIOCGLAGGOPTS:
		LAGG_XLOCK(sc);
		ro->ro_opts = sc->sc_opts;
		if (sc->sc_proto ==
LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_bkt; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { if (ro->ro_bkt == 0) sc->sc_bkt = 1; // Minimum 1 packet per iface. else sc->sc_bkt = ro->ro_bkt; } error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ro->ro_opts == 0) break; /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TIMEOUT: case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: valid = lacp = 0; break; } LAGG_XLOCK(sc); if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT and LACP options. 
*/
		if (lacp == 0) {
			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
				sc->flowid_shift = ro->ro_flowid_shift;
			else if (ro->ro_opts > 0)
				sc->sc_opts |= ro->ro_opts;
			else
				sc->sc_opts &= ~ro->ro_opts;
		} else {
			struct lacp_softc *lsc;
			struct lacp_port *lp;	/* shadows the outer lagg_port lp */

			lsc = (struct lacp_softc *)sc->sc_psc;

			/* A positive option sets the setting, its negation clears it. */
			switch (ro->ro_opts) {
			case LAGG_OPT_LACP_TXTEST:
				lsc->lsc_debug.lsc_tx_test = 1;
				break;
			case -LAGG_OPT_LACP_TXTEST:
				lsc->lsc_debug.lsc_tx_test = 0;
				break;
			case LAGG_OPT_LACP_RXTEST:
				lsc->lsc_debug.lsc_rx_test = 1;
				break;
			case -LAGG_OPT_LACP_RXTEST:
				lsc->lsc_debug.lsc_rx_test = 0;
				break;
			case LAGG_OPT_LACP_STRICT:
				lsc->lsc_strict_mode = 1;
				break;
			case -LAGG_OPT_LACP_STRICT:
				lsc->lsc_strict_mode = 0;
				break;
			case LAGG_OPT_LACP_TIMEOUT:
				/* Apply the timeout bit to every LACP port as well. */
				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state |= LACP_STATE_TIMEOUT;
				LACP_UNLOCK(lsc);
				lsc->lsc_fast_timeout = 1;
				break;
			case -LAGG_OPT_LACP_TIMEOUT:
				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state &= ~LACP_STATE_TIMEOUT;
				LACP_UNLOCK(lsc);
				lsc->lsc_fast_timeout = 0;
				break;
			}
		}
		LAGG_XUNLOCK(sc);
		break;
	case SIOCGLAGGFLAGS:
		/* Report which layers are used for hashing. */
		rf->rf_flags = 0;
		LAGG_XLOCK(sc);
		if (sc->sc_flags & MBUF_HASHFLAG_L2)
			rf->rf_flags |= LAGG_F_HASHL2;
		if (sc->sc_flags & MBUF_HASHFLAG_L3)
			rf->rf_flags |= LAGG_F_HASHL3;
		if (sc->sc_flags & MBUF_HASHFLAG_L4)
			rf->rf_flags |= LAGG_F_HASHL4;
		LAGG_XUNLOCK(sc);
		break;
	case SIOCSLAGGHASH:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		/* At least one hash layer must be requested. */
		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
			error = EINVAL;
			break;
		}
		LAGG_XLOCK(sc);
		sc->sc_flags = 0;
		if (rf->rf_flags & LAGG_F_HASHL2)
			sc->sc_flags |= MBUF_HASHFLAG_L2;
		if (rf->rf_flags & LAGG_F_HASHL3)
			sc->sc_flags |= MBUF_HASHFLAG_L3;
		if (rf->rf_flags & LAGG_F_HASHL4)
			sc->sc_flags |= MBUF_HASHFLAG_L4;
		LAGG_XUNLOCK(sc);
		break;
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK();
		/* The named interface must be a port of this lagg. */
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error =
ENOENT;
			LAGG_RUNLOCK();
			if_rele(tpif);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK();
		if_rele(tpif);
		break;
	case SIOCSLAGGPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}
#ifdef INET6
		/*
		 * A laggport interface should not have inet6 address
		 * because two interfaces with a valid link-local
		 * scope zone must not be merged in any form.  This
		 * restriction is needed to prevent violation of
		 * link-local scope zone.  Attempts to add a laggport
		 * interface which has inet6 addresses triggers
		 * removal of all inet6 addresses on the member
		 * interface.
		 */
		if (in6ifa_llaonifp(tpif)) {
			in6_ifdetach(tpif);
				if_printf(sc->sc_ifp,
				    "IPv6 addresses on %s have been removed "
				    "before adding it as a member to prevent "
				    "IPv6 address scope violation.\n",
				    tpif->if_xname);
		}
#endif
		LAGG_XLOCK(sc);
		error = lagg_port_create(sc, tpif);
		LAGG_XUNLOCK(sc);
		/* Drop the lookup reference taken by ifunit_ref(). */
		if_rele(tpif);
		VLAN_CAPABILITIES(ifp);
		break;
	case SIOCSLAGGDELPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_XLOCK(sc);
		/* The named interface must be a port of this lagg. */
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_XUNLOCK(sc);
			if_rele(tpif);
			break;
		}

		error = lagg_port_destroy(lp, 1);
		LAGG_XUNLOCK(sc);
		if_rele(tpif);
		VLAN_CAPABILITIES(ifp);
		break;
	case SIOCSIFFLAGS:
		/* Set flags on ports too */
		LAGG_XLOCK(sc);
		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setflags(lp, 1);
		}

		if (!(ifp->if_flags & IFF_UP) &&
		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked down and it is running,
			 * then stop and disable it.
			 */
			lagg_stop(sc);
			LAGG_XUNLOCK(sc);
		} else if ((ifp->if_flags & IFF_UP) &&
		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked up and it is stopped, then
			 * start it.
*/
			/* Unlock first: lagg_init() takes the exclusive lock itself. */
			LAGG_XUNLOCK(sc);
			(*ifp->if_init)(sc);
		} else
			LAGG_XUNLOCK(sc);
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Rebuild every port's multicast list from the lagg's list. */
		LAGG_XLOCK(sc);
		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_clrmulti(lp);
			lagg_setmulti(lp);
		}
		LAGG_XUNLOCK(sc);
		error = 0;
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;

	case SIOCSIFCAP:
		/* Forward to every port (results ignored), then recompute. */
		LAGG_XLOCK(sc);
		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (lp->lp_ioctl != NULL)
				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
		}
		lagg_capabilities(sc);
		LAGG_XUNLOCK(sc);
		VLAN_CAPABILITIES(ifp);
		error = 0;
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be directly changed */
		error = EINVAL;
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}

#ifdef RATELIMIT
/*
 * if_snd_tag_alloc handler: pick the member port the flow would use
 * and forward the send-tag allocation request to it.  Returns
 * EOPNOTSUPP when no port can be chosen for the current protocol, or
 * when the chosen port has no if_snd_tag_alloc method or does not have
 * IFCAP_TXRTLMT enabled.
 */
static int
lagg_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;
	struct lagg_lb *lb;
	uint32_t p;

	switch (sc->sc_proto) {
	case LAGG_PROTO_FAILOVER:
		lp = lagg_link_active(sc, sc->sc_primary);
		break;
	case LAGG_PROTO_LOADBALANCE:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    params->hdr.flowtype == M_HASHTYPE_NONE)
			return (EOPNOTSUPP);
		/* NOTE(review): sc_count is not checked for zero before the modulo. */
		p = params->hdr.flowid >> sc->flowid_shift;
		p %= sc->sc_count;
		lb = (struct lagg_lb *)sc->sc_psc;
		lp = lb->lb_ports[p];
		lp = lagg_link_active(sc, lp);
		break;
	case LAGG_PROTO_LACP:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    params->hdr.flowtype == M_HASHTYPE_NONE)
			return (EOPNOTSUPP);
		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
		break;
	default:
		return (EOPNOTSUPP);
	}
	if (lp == NULL)
		return (EOPNOTSUPP);
	ifp = lp->lp_ifp;
	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
		return (EOPNOTSUPP);

	/* forward allocation request */
	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
}
#endif

static int
lagg_setmulti(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp =
lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma;
	int error;

	/*
	 * Pass 1: under the address lock, record a copy of each AF_LINK
	 * multicast address of the lagg on the port's lp_mc_head list.
	 * Allocation uses M_NOWAIT; on failure ENOMEM is returned with
	 * the entries recorded so far left on the list.
	 */
	IF_ADDR_WLOCK(scifp);
	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
		if (mc == NULL) {
			IF_ADDR_WUNLOCK(scifp);
			return (ENOMEM);
		}
		bcopy(ifma->ifma_addr, &mc->mc_addr,
		    ifma->ifma_addr->sa_len);
		mc->mc_addr.sdl_index = ifp->if_index;
		mc->mc_ifma = NULL;
		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
	}
	IF_ADDR_WUNLOCK(scifp);
	/* Pass 2: join each recorded address on the port, lock dropped. */
	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
		error = if_addmulti(ifp,
		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
		if (error)
			return (error);
	}
	return (0);
}

/*
 * Free every entry on the port's recorded multicast list.  The
 * membership itself is only deleted when the entry has an ifma and the
 * port is not marked detaching.  Always returns 0.
 */
static int
lagg_clrmulti(struct lagg_port *lp)
{
	struct lagg_mc *mc;

	LAGG_XLOCK_ASSERT(lp->lp_softc);
	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
		if (mc->mc_ifma && lp->lp_detaching == 0)
			if_delmulti_ifma(mc->mc_ifma);
		free(mc, M_DEVBUF);
	}
	return (0);
}

/*
 * Ask the port's saved ioctl handler (SIOCSIFCAP) to set its enabled
 * capabilities to `cap'.  Returns 0 without calling it when they
 * already match, ENXIO when the port has no saved handler.
 */
static int
lagg_setcaps(struct lagg_port *lp, int cap)
{
	struct ifreq ifr;

	if (lp->lp_ifp->if_capenable == cap)
		return (0);
	if (lp->lp_ioctl == NULL)
		return (ENXIO);
	/* NOTE(review): only ifr_reqcap is initialised in ifr. */
	ifr.ifr_reqcap = cap;
	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
}

/* Handle a ref counted flag that should be set on the lagg port as well */
static int
lagg_setflag(struct lagg_port *lp, int flag, int status,
    int (*func)(struct ifnet *, int))
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;
	struct ifnet *ifp = lp->lp_ifp;
	int error;

	LAGG_XLOCK_ASSERT(sc);

	status = status ? (scifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if recorded ports status is different from what
	 * we want it to be.  If it is, flip it.  We record ports
	 * status in lp_ifflags so that we won't clear ports flag
	 * we haven't set.  In fact, we don't clear or set ports
	 * flags directly, but get or release references to them.
* That's why we can be sure that recorded flags still are
	 * in accord with actual ports flags.
	 */
	if (status != (lp->lp_ifflags & flag)) {
		error = (*func)(ifp, status);
		if (error)
			return (error);
		/* Record the new state only after func() succeeded. */
		lp->lp_ifflags &= ~flag;
		lp->lp_ifflags |= status;
	}
	return (0);
}

/*
 * Handle IFF_* flags that require certain changes on the lagg port
 * if "status" is true, update ports flags respective to the lagg
 * if "status" is false, forcedly clear the flags set on port.
 *
 * Stops at, and returns, the first error from lagg_setflag().
 */
static int
lagg_setflags(struct lagg_port *lp, int status)
{
	int error, i;

	/* lagg_pflags[] is terminated by an entry with a zero flag. */
	for (i = 0; lagg_pflags[i].flag; i++) {
		error = lagg_setflag(lp, lagg_pflags[i].flag,
		    status, lagg_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

/*
 * if_transmit handler: hand the mbuf to the current protocol's start
 * routine.  With no protocol or no ports the mbuf is freed and ENXIO is
 * returned.  Every failure increments IFCOUNTER_OERRORS.
 */
static int
lagg_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	int error;

	LAGG_RLOCK();
	/* We need a Tx algorithm and at least one port */
	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
		LAGG_RUNLOCK();
		m_freem(m);
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (ENXIO);
	}

	ETHER_BPF_MTAP(ifp, m);

	error = lagg_proto_start(sc, m);
	LAGG_RUNLOCK();

	if (error != 0)
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);

	return (error);
}

/*
 * The ifp->if_qflush entry point for lagg(4) is no-op.
*/
static void
lagg_qflush(struct ifnet *ifp __unused)
{
}

/*
 * Input hook for frames received on a member port (installed as
 * lagg_input_p in lagg_modevent()).  Returns the mbuf the protocol's
 * input routine hands back, or NULL when the frame was dropped here or
 * by the protocol.
 */
static struct mbuf *
lagg_input(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	LAGG_RLOCK();
	/* Drop if the lagg is down, the port is detaching or no proto is set. */
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    lp->lp_detaching != 0 ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		LAGG_RUNLOCK();
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = lagg_proto_input(sc, lp, m);
	/* With IFF_MONITOR set the frame is tapped above and then dropped. */
	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
		m_freem(m);
		m = NULL;
	}

	LAGG_RUNLOCK();
	return (m);
}

/* ifmedia change callback: media changes are ignored; always returns 0. */
static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

/*
 * ifmedia status callback: always reports IFM_ETHER | IFM_AUTO, and
 * sets IFM_ACTIVE if at least one port is active.
 */
static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	LAGG_RLOCK();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	LAGG_RUNLOCK();
}

/*
 * Recompute the lagg's link state and if_baudrate from its ports.
 * Caller must hold the exclusive lock.
 */
static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	LAGG_XLOCK_ASSERT(sc);

	/* Our link is considered up if at least one of our ports is active */
	LAGG_RLOCK();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	LAGG_RUNLOCK();
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; LAGG_RUNLOCK(); sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_XLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_XUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; + struct epoch_tracker net_et; /* * Search a port which reports an active link state. */ if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: - LAGG_RLOCK(); + epoch_enter_preempt(net_epoch_preempt, &net_et); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { - LAGG_RUNLOCK(); - rval = lp_next; - goto found; + epoch_exit_preempt(net_epoch_preempt, &net_et); + return (lp_next); } } - LAGG_RUNLOCK(); + epoch_exit_preempt(net_epoch_preempt, &net_et); found: return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; sc->sc_bkt_count = sc->sc_bkt; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) sc->sc_bkt_count = sc->sc_bkt; if (sc->sc_bkt > 0) { atomic_subtract_int(&sc->sc_bkt_count, 1); if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) p = 
atomic_fetchadd_32(&sc->sc_seq, 1); else p = sc->sc_seq; } else p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = CK_SLIST_FIRST(&sc->sc_ports); while (p--) lp = CK_SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } LAGG_RUNLOCK(); if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * 
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_DEVBUF); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); LAGG_RLOCK(); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } LAGG_RUNLOCK(); return (0); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & 
LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_SXLOCK_ASSERT(sc); /* purge all the lacp ports */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = 
lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: head/sys/net/if_me.c =================================================================== --- head/sys/net/if_me.c (revision 335923) +++ head/sys/net/if_me.c (revision 335924) @@ -1,632 +1,632 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MEMTU (1500 - sizeof(struct mobhdr)) static const char mename[] = "me"; static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP"); /* Minimal forwarding header RFC 2004 */ struct mobhdr { uint8_t mob_proto; /* protocol */ uint8_t mob_flags; /* flags */ #define MOB_FLAGS_SP 0x80 /* source present */ uint16_t mob_csum; /* header checksum */ struct in_addr mob_dst; /* original destination address */ struct in_addr mob_src; /* original source addr (optional) */ } __packed; struct me_softc { struct ifnet *me_ifp; u_int me_fibnum; struct in_addr me_src; struct in_addr me_dst; CK_LIST_ENTRY(me_softc) chain; }; CK_LIST_HEAD(me_list, me_softc); #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) -#define ME_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define ME_RLOCK() struct epoch_tracker me_et; epoch_enter_preempt(net_epoch_preempt, &me_et) +#define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &me_et) #define ME_WAIT() epoch_wait_preempt(net_epoch_preempt) #ifndef ME_HASH_SIZE #define ME_HASH_SIZE (1 << 4) #endif static VNET_DEFINE(struct me_list *, me_hashtbl) = NULL; #define V_me_hashtbl VNET(me_hashtbl) #define ME_HASH(src, dst) (V_me_hashtbl[\ me_hashval((src), (dst)) & (ME_HASH_SIZE - 1)]) static struct sx me_ioctl_sx; SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl"); static int me_clone_create(struct if_clone *, int, caddr_t); static void me_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, me_cloner); #define V_me_cloner VNET(me_cloner) static void me_qflush(struct ifnet *); static int me_transmit(struct ifnet 
*, struct mbuf *); static int me_ioctl(struct ifnet *, u_long, caddr_t); static int me_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int me_input(struct mbuf *, int, int, void *); static int me_set_tunnel(struct me_softc *, in_addr_t, in_addr_t); static void me_delete_tunnel(struct me_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW, 0, "Minimal Encapsulation for IP (RFC 2004)"); #ifndef MAX_ME_NEST #define MAX_ME_NEST 1 #endif static VNET_DEFINE(int, max_me_nesting) = MAX_ME_NEST; #define V_max_me_nesting VNET(max_me_nesting) SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_me_nesting), 0, "Max nested tunnels"); static uint32_t me_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static struct me_list * me_hashinit(void) { struct me_list *hash; int i; hash = malloc(sizeof(struct me_list) * ME_HASH_SIZE, M_IFME, M_WAITOK); for (i = 0; i < ME_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } static void vnet_me_init(const void *unused __unused) { V_me_cloner = if_clone_simple(mename, me_clone_create, me_clone_destroy, 0); } VNET_SYSINIT(vnet_me_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_init, NULL); static void vnet_me_uninit(const void *unused __unused) { if (V_me_hashtbl != NULL) free(V_me_hashtbl, M_IFME); if_clone_detach(V_me_cloner); } VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_uninit, NULL); static int me_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct me_softc *sc; sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO); sc->me_fibnum = curthread->td_proc->p_fibnum; ME2IFP(sc) = if_alloc(IFT_TUNNEL); ME2IFP(sc)->if_softc = sc; if_initname(ME2IFP(sc), mename, unit); ME2IFP(sc)->if_mtu = MEMTU;; ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; 
ME2IFP(sc)->if_output = me_output; ME2IFP(sc)->if_ioctl = me_ioctl; ME2IFP(sc)->if_transmit = me_transmit; ME2IFP(sc)->if_qflush = me_qflush; ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(ME2IFP(sc)); bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } static void me_clone_destroy(struct ifnet *ifp) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; me_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&me_ioctl_sx); ME_WAIT(); if_free(ifp); free(sc, M_IFME); } static int me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *src, *dst; struct me_softc *sc; int error; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); } sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } error = me_set_tunnel(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); break; case SIOCDIFPHYADDR: me_delete_tunnel(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (!ME_READY(sc)) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); switch (cmd) { case SIOCGIFPSRCADDR: src->sin_addr = sc->me_src; break; case SIOCGIFPDSTADDR: src->sin_addr = sc->me_dst; break; } error 
= prison_if(curthread->td_ucred, sintosa(src)); if (error != 0) memset(src, 0, sizeof(*src)); break; case SIOCGTUNFIB: ifr->ifr_fib = sc->me_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->me_fibnum = ifr->ifr_fib; break; default: error = EINVAL; break; } end: sx_xunlock(&me_ioctl_sx); return (error); } static int me_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct me_softc *sc; if (V_me_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { if (sc->me_src.s_addr == ip->ip_dst.s_addr && sc->me_dst.s_addr == ip->ip_src.s_addr) { if ((ME2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } static int me_set_tunnel(struct me_softc *sc, in_addr_t src, in_addr_t dst) { struct me_softc *tmp; sx_assert(&me_ioctl_sx, SA_XLOCKED); if (V_me_hashtbl == NULL) V_me_hashtbl = me_hashinit(); if (sc->me_src.s_addr == src && sc->me_dst.s_addr == dst) return (0); CK_LIST_FOREACH(tmp, &ME_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->me_src.s_addr == src && tmp->me_dst.s_addr == dst) return (EADDRNOTAVAIL); } me_delete_tunnel(sc); sc->me_dst.s_addr = dst; sc->me_src.s_addr = src; CK_LIST_INSERT_HEAD(&ME_HASH(src, dst), sc, chain); ME2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ME2IFP(sc), LINK_STATE_UP); return (0); } static void me_delete_tunnel(struct me_softc *sc) { sx_assert(&me_ioctl_sx, SA_XLOCKED); if (ME_READY(sc)) { CK_LIST_REMOVE(sc, chain); ME_WAIT(); sc->me_src.s_addr = 0; sc->me_dst.s_addr = 0; ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ME2IFP(sc), LINK_STATE_DOWN); } } static uint16_t me_in_cksum(uint16_t *p, int nwords) { uint32_t sum = 0; while (nwords-- > 0) sum += *p++; sum = 
(sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int me_input(struct mbuf *m, int off, int proto, void *arg) { struct me_softc *sc = arg; struct mobhdr *mh; struct ifnet *ifp; struct ip *ip; int hlen; ifp = ME2IFP(sc); /* checks for short packets */ hlen = sizeof(struct mobhdr); if (m->m_pkthdr.len < sizeof(struct ip) + hlen) hlen -= sizeof(struct in_addr); if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) goto drop; mh = (struct mobhdr *)mtodo(m, sizeof(struct ip)); /* check for wrong flags */ if (mh->mob_flags & (~MOB_FLAGS_SP)) { m_freem(m); goto drop; } if (mh->mob_flags) { if (hlen != sizeof(struct mobhdr)) { m_freem(m); goto drop; } } else hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); /* check mobile header checksum */ if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) { m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif ip = mtod(m, struct ip *); ip->ip_dst = mh->mob_dst; ip->ip_p = mh->mob_proto; ip->ip_sum = 0; ip->ip_len = htons(m->m_pkthdr.len - hlen); if (mh->mob_flags) ip->ip_src = mh->mob_src; memmove(mtodo(m, hlen), ip, sizeof(struct ip)); m_adj(m, hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); M_SETFIB(m, ifp->if_fib); hlen = AF_INET; BPF_MTAP2(ifp, &hlen, sizeof(hlen), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(NETISR_IP, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (IPPROTO_DONE); } #define MTAG_ME 1414491977 static int me_check_nesting(struct ifnet *ifp, struct mbuf *m) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, MTAG_ME, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", ifp->if_xname); return (EIO); } 
count++; } if (count > V_max_me_nesting) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", ifp->if_xname, count); return (EIO); } mtag = m_tag_alloc(MTAG_ME, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } static int me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro __unused) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } static int me_transmit(struct ifnet *ifp, struct mbuf *m) { struct mobhdr mh; struct me_softc *sc; struct ip *ip; uint32_t af; int error, hlen, plen; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif error = ENETDOWN; ME_RLOCK(); sc = ifp->if_softc; if (sc == NULL || !ME_READY(sc) || (ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (error = me_check_nesting(ifp, m) != 0)) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; if (af != AF_INET) { error = EAFNOSUPPORT; m_freem(m); goto drop; } if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto drop; } ip = mtod(m, struct ip *); /* Fragmented datagramms shouldn't be encapsulated */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { error = EINVAL; m_freem(m); goto drop; } mh.mob_proto = ip->ip_p; mh.mob_src = ip->ip_src; mh.mob_dst = ip->ip_dst; if (in_hosteq(sc->me_src, ip->ip_src)) { hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); mh.mob_flags = 0; } else { hlen = sizeof(struct mobhdr); mh.mob_flags = MOB_FLAGS_SP; } BPF_MTAP2(ifp, &af, sizeof(af), m); plen = m->m_pkthdr.len; ip->ip_src = sc->me_src; ip->ip_dst = sc->me_dst; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->me_fibnum); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } if (m->m_len < sizeof(struct 
ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) { error = ENOBUFS; goto drop; } memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip)); ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_p = IPPROTO_MOBILE; ip->ip_sum = 0; mh.mob_csum = 0; mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t)); bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } ME_RUNLOCK(); return (error); } static void me_qflush(struct ifnet *ifp __unused) { } static const struct encaptab *ecookie = NULL; static const struct encap_config me_encap_cfg = { .proto = IPPROTO_MOBILE, .min_length = sizeof(struct ip) + sizeof(struct mobhdr) - sizeof(in_addr_t), .exact_match = ENCAP_DRV_LOOKUP, .lookup = me_lookup, .input = me_input }; static int memodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ecookie = ip_encap_attach(&me_encap_cfg, NULL, M_WAITOK); break; case MOD_UNLOAD: ip_encap_detach(ecookie); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t me_mod = { "if_me", memodevent, 0 }; DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_me, 1); Index: head/sys/net/if_var.h =================================================================== --- head/sys/net/if_var.h (revision 335923) +++ head/sys/net/if_var.h (revision 335924) @@ -1,767 +1,771 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. 
* * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; struct netdump_methods; #ifdef _KERNEL #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include CK_STAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ CK_STAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ #define V_link_pfil_hook VNET(link_pfil_hook) #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 #define HHOOK_IPSEC_COUNT 2 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); #define V_ipsec_hhh_in VNET(ipsec_hhh_in) #define V_ipsec_hhh_out VNET(ipsec_hhh_out) extern epoch_t net_epoch_preempt; extern epoch_t net_epoch; #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, 
IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size. */ } ift_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ u_int tsomaxsegcount; /* TSO maximum segment count */ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; /* Interface encap request types */ typedef enum { IFENCAP_LL = 1 /* pre-calculate link-layer header */ } ife_type; /* * The structure below allows to request various pre-calculated L2/L3 headers * for different media. Requests varies by type (rtype field). * * IFENCAP_LL type: pre-calculates link header based on address family * and destination lladdr. * * Input data fields: * buf: pointer to destination buffer * bufsize: buffer size * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast * family: address family defined by AF_ constant. * lladdr: pointer to link-layer address * lladdr_len: length of link-layer address * hdata: pointer to L3 header (optional, used for ARP requests). 
* Output data fields: * buf: encap data is stored here * bufsize: resulting encap length is stored here * lladdr_off: offset of link-layer address from encap hdr start * hdata: L3 header may be altered if necessary */ struct if_encap_req { u_char *buf; /* Destination buffer (w) */ size_t bufsize; /* size of provided buffer (r) */ ife_type rtype; /* request type (r) */ uint32_t flags; /* Request flags (r) */ int family; /* Address family AF_* (r) */ int lladdr_off; /* offset from header start (w) */ int lladdr_len; /* lladdr length (r) */ char *lladdr; /* link-level address pointer (r) */ char *hdata; /* Upper layer header data (rw) */ }; #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ /* * Network interface send tag support. The storage of "struct * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 #define IF_SND_TAG_TYPE_MAX 2 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ }; struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ }; struct if_snd_tag_rate_limit_params { uint64_t max_rate; /* in bytes/s */ uint32_t queue_level; /* 0 (empty) .. 
65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 uint32_t reserved; /* padding */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; }; union if_snd_tag_modify_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); /* * Structure defining a network interface. */ struct ifnet { /* General book keeping of interface lists. */ CK_STAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained (CK_) */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. 
*/ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ /* Addresses of different protocol families assigned to this if. */ struct mtx if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. 
*/ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_hw_addr; /* hardware link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ struct mtx if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. 
*/ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *); int (*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*if_bridge_linkstate)(struct ifnet *ifp); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ int (*if_requestencap) /* make link header from request */ (struct ifnet *, struct if_encap_req *); /* Statistics. */ counter_u64_t if_counters[IFCOUNTERS]; /* Stuff that's only temporary and doesn't belong here. */ /* * Network adapter TSO limits: * =========================== * * If the "if_hw_tsomax" field is zero the maximum segment * length limit does not apply. If the "if_hw_tsomaxsegcount" * or the "if_hw_tsomaxsegsize" field is zero the TSO segment * count limit does not apply. If all three fields are zero, * there is no TSO limit. * * NOTE: The TSO limits should reflect the values used in the * BUSDMA tag a network adapter is using to load a mbuf chain * for transmission. The TCP/IP network stack will subtract * space for all linklevel and protocol level headers and * ensure that the full mbuf chain passed to the network * adapter fits within the given limits. 
*/ u_int if_hw_tsomax; /* TSO maximum size in bytes */ u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ /* * Network adapter send tag support: */ if_snd_tag_alloc_t *if_snd_tag_alloc; if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; /* Ethernet PCP */ uint8_t if_pcp; /* * Netdump hooks to be called while dumping. */ struct netdump_methods *if_netdump_methods; struct epoch_context if_epoch_ctx; + struct epoch_tracker if_addr_et; + struct epoch_tracker if_maddr_et; /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ int if_ispare[4]; /* general use */ }; /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) -#define IF_ADDR_RLOCK(if) epoch_enter_preempt(net_epoch_preempt); -#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt); +#define IF_ADDR_RLOCK(if) struct epoch_tracker if_addr_et; epoch_enter_preempt(net_epoch_preempt, &if_addr_et); +#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt, &if_addr_et); #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) -#define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch() || mtx_owned(&(if)->if_addr_lock)) +#define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) -#define NET_EPOCH_ENTER() epoch_enter_preempt(net_epoch_preempt) -#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt) +#define NET_EPOCH_ENTER() struct epoch_tracker nep_et; 
epoch_enter_preempt(net_epoch_preempt, &nep_et) +#define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) +#define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) /* * Function variations on locking macros intended to be used by loadable * kernel modules in order to divorce them from the internals of address list * locking. */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL #ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* Interface up/down event */ #define IFNET_EVENT_UP 0 #define IFNET_EVENT_DOWN 1 #define IFNET_EVENT_PCP 2 /* priority code point, PCP */ typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); #endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; 
CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ }; struct ifg_member { CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) #define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RLOCK(ifp) epoch_enter_preempt(net_epoch_preempt) +#define IF_AFDATA_RLOCK(ifp) struct epoch_tracker if_afdata_et; epoch_enter_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt) +#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) -#define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch() || mtx_owned(&(ifp)->if_afdata_lock)) -#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch()); +#define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock)) +#define IF_AFDATA_RLOCK_ASSERT(ifp) 
MPASS(in_epoch(net_epoch_preempt)); #define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; struct epoch_context ifa_epoch_ctx; }; struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); /* * Multicast address structure. 
This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ struct ifmultiaddr { CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ struct epoch_context ifma_epoch_ctx; }; extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) -#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch()) +#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) -#define IFNET_RLOCK_NOSLEEP() epoch_enter_preempt(net_epoch_preempt) +#define IFNET_RLOCK_NOSLEEP() struct epoch_tracker ifnet_rlock_et; epoch_enter_preempt(net_epoch_preempt, &ifnet_rlock_et) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) -#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt) +#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt, &ifnet_rlock_et) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead of ifnet_byindex_ref(). 
*/ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_locked(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) #ifdef MCAST_VERBOSE #define MCDPRINTF printf #else #define MCDPRINTF(...) #endif int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_delmulti_ifma_flags(struct ifmultiaddr *, int flags); void if_detach(struct ifnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, const struct sockaddr *); void if_freemulti(struct ifmultiaddr *ifma); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) 
__printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); int ifa_ifwithaddr_check(const struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_default(struct ifnet *, ift_counter); void if_inc_counter(struct ifnet *, ift_counter, int64_t); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int 
clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_gethwaddr(if_t ifp, struct ifreq *); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_getmtu_family(if_t ifp, int family); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax); int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount); int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize); u_int if_gethwtsomax(if_t ifp); u_int if_gethwtsomaxsegcount(if_t ifp); u_int if_gethwtsomaxsegsize(if_t ifp); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_count(if_t ifp, int max); int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t 
ifp); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); void if_setgetcounterfn(if_t ifp, if_get_counter_t); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); /* TSO */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); int ether_poll_register(poll_handler_t *h, if_t ifp); int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ #endif /* _KERNEL */ #include /* XXXAO: temporary unconditional include */ #endif /* !_NET_IF_VAR_H_ */ Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 335923) +++ head/sys/net/route.c (revision 335924) @@ -1,2266 +1,2266 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. 
*/ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun approach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below.
*/ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. 
*/ rnh += table * (AF_MAX+1) + fam; return (rnh); } /* Return the rib_head pointer stored in the slot for (table, fam). */ struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } /* Return rnh_gen of the rib_head for (table, fam); asserts that the head is non-NULL. */ u_int rt_tables_get_gen(int table, int fam) { struct rib_head *rnh; rnh = *rt_tables_get_rnh_ptr(table, fam); KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", __func__, table, fam)); return (rnh->rnh_gen); } /* * route initialization must occur before ip6_init2(), which happens at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); /* rtentry zone init callback (handed to uma_zcreate() in vnet_route_init()): allocate the rt_pksent counter and set up the route lock. */ static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } /* rtentry zone fini callback: tear down the route lock and release the rt_pksent counter. */ static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } /* rtentry zone constructor: zero the entry up to rt_endzero, reset the rt_pksent counter and clear rt_chain. */ static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } /* rtentry zone destructor: runs RT_UNLOCK_COND() on the entry (presumably drops the route lock only if it is held -- confirm against the macro). */ static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } /* Allocate the zeroed array of rt_numfibs * (AF_MAX+1) rib_head pointers, create the rtentry zone, then walk the domains that provide dom_rtattach; tables other than 0 are only visited for AF_INET and AF_INET6. */ static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh ==
NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE /* Call dom_rtdetach for every domain that provides one (tables other than 0 only for AF_INET and AF_INET6), then free the rib_head pointer array and destroy the rtentry zone. */ static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif /* Allocate a zeroed rib_head, initialize its radix head (using 'offset') and its mask head, initialize the lock, and install the rn_* radix routines as the base callbacks. Returns the new rib_head. */ struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidden inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ RIB_LOCK_INIT(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } /* rn_walktree() callback used by rt_table_destroy(): delete the node (keyed by rn + 2) from the radix head passed in 'arg' and free whatever rn_delete() returns. Always returns 0. */ static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } /* Free every entry of the mask tree, then destroy the lock and free the rib_head itself. */ void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if
(uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); return (newrt); } else if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); /* * Either we hit the root or could not find any match, * which basically means: "cannot get there from here". */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. 
(report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. 
*/ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; NET_EPOCH_ENTER(); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. 
*/ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; ifa_ref(ifa); info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: NET_EPOCH_EXIT(); if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? 
mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ }

/*
 * Pick the interface address to use for a route to @dst via @gateway
 * in fib @fibnum.
 *
 * The candidates are tried in this order:
 *   1. without RTF_GATEWAY: the point-to-point peer matching @dst (only
 *      for RTF_HOST), then the local address equal to @gateway;
 *      with RTF_GATEWAY: the point-to-point peer matching @gateway;
 *   2. an address on the same network as @gateway;
 *   3. the ifa of the route that currently leads to @gateway, unless
 *      that route's key is the unspecified address.
 * If the result is in a different address family than @dst, an address
 * on the same interface suitable for @dst is preferred when one exists.
 *
 * Returns the chosen ifaddr or NULL.  No reference is taken here.
 * The caller must be inside the network epoch (asserted below).
 */
struct ifaddr *
ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway,
    u_int fibnum)
{
	struct ifaddr *ifa;
	int not_found = 0;

	/*
	 * NOTE(review): the next two lines still carry unified-diff
	 * markers ('-' old form, '+' new form taking an epoch argument);
	 * confirm which one belongs in the tree.
	 */
-	MPASS(in_epoch());
+	MPASS(in_epoch(net_epoch_preempt));
	if ((flags & RTF_GATEWAY) == 0) {
		/*
		 * If we are adding a route to an interface,
		 * and the interface is a pt to pt link
		 * we should search for the destination
		 * as our clue to the interface.  Otherwise
		 * we can use the local address.
		 */
		ifa = NULL;
		if (flags & RTF_HOST)
			ifa = ifa_ifwithdstaddr(dst, fibnum);
		if (ifa == NULL)
			ifa = ifa_ifwithaddr(gateway);
	} else {
		/*
		 * If we are adding a route to a remote net
		 * or host, the gateway may still be on the
		 * other end of a pt to pt link.
		 */
		ifa = ifa_ifwithdstaddr(gateway, fibnum);
	}
	if (ifa == NULL)
		ifa = ifa_ifwithnet(gateway, 0, fibnum);
	if (ifa == NULL) {
		struct rtentry *rt;

		/* Last resort: follow the existing route to the gateway. */
		rt = rtalloc1_fib(gateway, 0, flags, fibnum);
		if (rt == NULL)
			goto out;
		/*
		 * dismiss a gateway that is reachable only
		 * through the default router
		 */
		switch (gateway->sa_family) {
		case AF_INET:
			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
				not_found = 1;
			break;
		case AF_INET6:
			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
				not_found = 1;
			break;
		default:
			break;
		}
		if (!not_found && rt->rt_ifa != NULL) {
			ifa = rt->rt_ifa;
		}
		/* Drop the reference and lock obtained from rtalloc1_fib(). */
		RT_REMREF(rt);
		RT_UNLOCK(rt);
		if (not_found || ifa == NULL)
			goto out;
	}
	if (ifa->ifa_addr->sa_family != dst->sa_family) {
		struct ifaddr *oifa = ifa;

		/* Prefer an address on the same ifp that fits dst. */
		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
		if (ifa == NULL)
			ifa = oifa;
	}
 out:
	return (ifa);
}

/* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { 
info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; ifa_ref(info->rti_ifa); if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); }

/*
 * Looks up the route entry for @dst in the RIB database of fib @fibnum.
 * Exports entry data to @info using rt_exportinfo(); the export is
 * always done in NHR_COPY mode, and of the caller's @flags only
 * NHR_REF is honoured.
 *
 * If @flags contains NHR_REF, refcounting is performed on rt_ifp.
 * All references can be released later by calling rib_free_info().
 *
 * @flowid is not used by this function.
 *
 * Returns 0 on success.
 * Returns ENOENT for lookup failure, ENOMEM for export failure.
 */
int
rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
    uint32_t flowid, struct rt_addrinfo *info)
{
	RIB_RLOCK_TRACKER;
	struct rib_head *rh;
	struct radix_node *rn;
	struct rtentry *rt;
	int error;

	KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum"));
	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
	if (rh == NULL)
		return (ENOENT);

	/* The export happens under the read lock. */
	RIB_RLOCK(rh);
	rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head);
	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
		rt = RNTORT(rn);
		/* Ensure route & ifp is UP */
		if (RT_LINK_IS_UP(rt->rt_ifp)) {
			flags = (flags & NHR_REF) | NHR_COPY;
			error = rt_exportinfo(rt, info, flags);
			RIB_RUNLOCK(rh);

			return (error);
		}
	}
	RIB_RUNLOCK(rh);

	return (ENOENT);
}

/*
 * Releases all references acquired by rib_lookup_info() when
 * called with NHR_REF flags.
 */
void
rib_free_info(struct rt_addrinfo *info)
{

	if_rele(info->rti_ifp);
}

/* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family?
*/ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } }

/*
 * Walk state shared between rt_foreach_fib_walk_del() and
 * rt_checkdelroute().
 */
struct rt_delinfo
{
	struct rt_addrinfo info;	/* filter and per-entry addresses */
	struct rib_head *rnh;		/* table currently being walked */
	struct rtentry *head;		/* unlinked entries, chained via rt_chain */
};

/*
 * Conditionally unlinks @rn from radix tree based
 * on info data passed in @arg.
 *
 * @arg is a struct rt_delinfo.  The entry's key, mask and gateway are
 * loaded into its info member and rt_unlinkrte() decides whether the
 * entry goes; an unlinked entry is pushed onto the head list.
 * Always returns 0.
 */
static int
rt_checkdelroute(struct radix_node *rn, void *arg)
{
	struct rt_delinfo *di;
	struct rt_addrinfo *info;
	struct rtentry *rt;
	int error;

	di = (struct rt_delinfo *)arg;
	rt = (struct rtentry *)rn;
	info = &di->info;
	error = 0;

	info->rti_info[RTAX_DST] = rt_key(rt);
	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
	info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;

	rt = rt_unlinkrte(di->rnh, info, &error);
	if (rt == NULL) {
		/* Either not allowed or not matched. Skip entry */
		return (0);
	}

	/* Entry was unlinked. Add to the list and return */
	rt->rt_chain = di->head;
	di->head = rt;

	return (0);
}

/* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family?
*/ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes_af(struct ifnet *ifp, int af) { KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", __func__, af, AF_MAX)); rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); } void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. 
* Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */
static struct rtentry *
rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror)
{
	struct sockaddr *dst, *netmask;
	struct rtentry *rt;
	struct radix_node *rn;

	dst = info->rti_info[RTAX_DST];
	netmask = info->rti_info[RTAX_NETMASK];

	/* Exact dst/netmask lookup; nothing is removed yet. */
	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
	if (rt == NULL) {
		*perror = ESRCH;
		return (NULL);
	}

	if ((info->rti_flags & RTF_PINNED) == 0) {
		/* Check if target route can be deleted */
		if (rt->rt_flags & RTF_PINNED) {
			*perror = EADDRINUSE;
			return (NULL);
		}
	}

	if (info->rti_filter != NULL) {
		if (info->rti_filter(rt, info->rti_filterdata) == 0) {
			/* Not matched */
			*perror = ENOENT;
			return (NULL);
		}

		/*
		 * Filter function requested rte deletion.
		 * Ease the caller work by filling in remaining info
		 * from that particular entry.
		 */
		info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
	}

	/*
	 * Remove the item from the tree and return it.
	 * Complain if it is not there and do no more processing.
	 */
	*perror = ESRCH;
#ifdef RADIX_MPATH
	if (rt_mpath_capable(rnh))
		rn = rt_mpath_unlink(rnh, info, rt, perror);
	else
#endif
	/* Without RADIX_MPATH this is unconditional; with it, the else arm. */
	rn = rnh->rnh_deladdr(dst, netmask, &rnh->head);
	if (rn == NULL)
		return (NULL);

	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
		panic ("rtrequest delete");

	/* Hand the entry back locked, referenced and marked down. */
	rt = RNTORT(rn);
	RT_LOCK(rt);
	RT_ADDREF(rt);
	rt->rt_flags &= ~RTF_UP;

	*perror = 0;

	return (rt);
}

static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table.
rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int needref, error; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; needref = (info->rti_ifa == NULL); NET_EPOCH_ENTER(); if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? 
gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (needref && info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; ifa_ref(info->rti_ifa); } else error = ENETUNREACH; NET_EPOCH_EXIT(); return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. 
*/ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. 
*/ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. 
*/ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. 
*/ ifa = info->rti_ifa; ifa_ref(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. 
*/ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { /* * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags * to avoid rlock in the ifa_ifwithroute(). 
*/ info->rti_flags |= RTF_RNH_LOCKED; error = rt_getifa_fib(info, fibnum); info->rti_flags &= ~RTF_RNH_LOCKED; if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL) { if (rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); rt->rt_ifa = NULL; } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their * llentry cache. */ rnh->rnh_gen++; if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. 
*/
			rt->rt_flags |= RTF_FIXEDMTU;
		} else {
			/*
			 * User explicitly sets MTU to 0.
			 * Assume rollback to default.
			 */
			rt->rt_flags &= ~RTF_FIXEDMTU;
		}
		rt->rt_mtu = info->rti_rmx->rmx_mtu;
	}
	if (info->rti_mflags & RTV_WEIGHT)
		rt->rt_weight = info->rti_rmx->rmx_weight;
	/* Kernel -> userland timebase conversion. */
	if (info->rti_mflags & RTV_EXPIRE)
		rt->rt_expire = info->rti_rmx->rmx_expire ?
		    info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
}

/*
 * Store a copy of @gate as the gateway of @rt.
 *
 * The key and the gateway share one allocation (key first, gateway right
 * after it).  When there is no gateway yet, or the new gateway needs more
 * room than the current one, a new chunk is allocated, @dst is copied in as
 * the key, the old chunk is released and both rt_key(rt) and rt->rt_gateway
 * are repointed.
 *
 * Returns 0 on success or ENOBUFS if the allocation fails.
 */
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
	/* XXX dst may be overwritten, can we move this to below */
	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);

	/*
	 * Prepare to store the gateway in rt->rt_gateway.
	 * Both dst and gateway are stored one after the other in the same
	 * malloc'd chunk. If we have room, we can reuse the old buffer,
	 * rt_gateway already points to the right place.
	 * Otherwise, malloc a new block and update the 'dst' address.
	 */
	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
		caddr_t new;

		R_Malloc(new, caddr_t, dlen + glen);
		if (new == NULL)
			return ENOBUFS;
		/*
		 * XXX note, we copy from *dst and not *rt_key(rt) because
		 * rt_setgate() can be called to initialize a newly
		 * allocated route entry, in which case rt_key(rt) == NULL
		 * (and also rt->rt_gateway == NULL).
		 * Free()/free() handle a NULL argument just fine.
		 */
		bcopy(dst, new, dlen);
		R_Free(rt_key(rt));	/* free old block, if any */
		rt_key(rt) = (struct sockaddr *)new;
		rt->rt_gateway = (struct sockaddr *)(new + dlen);
	}

	/*
	 * Copy the new gateway value into the memory chunk.
	 */
	bcopy(gate, rt->rt_gateway, glen);

	return (0);
}

/*
 * Copy @src into @dst while AND-ing the address bytes with @netmask.
 *
 * The first two bytes (sa_len and sa_family) are copied unmasked.  Masking
 * stops at the shorter of the netmask length and the source length; any
 * remaining bytes of @dst up to the source length are zeroed.
 * @dst must have room for at least src->sa_len bytes.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	u_char *cp1 = (u_char *)src;
	u_char *cp2 = (u_char *)dst;
	u_char *cp3 = (u_char *)netmask;
	u_char *cplim = cp2 + *cp3;	/* end of the masked region */
	u_char *cplim2 = cp2 + *cp1;	/* end of the destination */

	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
	cp3 += 2;
	if (cplim > cplim2)
		cplim = cplim2;
	while (cp2 < cplim)
		*cp2++ = *cp1++ & *cp3++;
	if (cp2 < cplim2)
		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}

/*
 * Set up a routing table entry, normally
 * for an interface.
 *
 * @cmd is RTM_ADD or RTM_DELETE, @flags are route flags (RTF_HOST selects
 * the destination address instead of address + netmask) and @fibnum is a
 * single FIB or RT_ALL_FIBS.  For address families other than AF_INET and
 * AF_INET6 the request is forced onto RT_DEFAULT_FIB.
 *
 * For RTM_DELETE the result is 0 if the route was removed from at least one
 * FIB, otherwise EHOSTUNREACH/ENETUNREACH.  For other commands the last
 * per-FIB error (if any) is returned.
 */
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
	RIB_RLOCK_TRACKER;
	struct sockaddr *dst;
	struct sockaddr *netmask;
	struct rtentry *rt = NULL;
	struct rt_addrinfo info;
	int error = 0;
	int startfib, endfib;
	char tempbuf[_SOCKADDR_TMPSIZE];
	int didwork = 0;
	int a_failure = 0;
	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
	struct rib_head *rnh;

	if (flags & RTF_HOST) {
		dst = ifa->ifa_dstaddr;
		netmask = NULL;
	} else {
		dst = ifa->ifa_addr;
		netmask = ifa->ifa_netmask;
	}
	if (dst->sa_len == 0)
		return(EINVAL);
	switch (dst->sa_family) {
	case AF_INET6:
	case AF_INET:
		/* We support multiple FIBs. */
		break;
	default:
		fibnum = RT_DEFAULT_FIB;
		break;
	}
	if (fibnum == RT_ALL_FIBS) {
		/*
		 * Adds are limited to the interface's own FIB unless the
		 * add-to-all-FIBs knob is set; everything else walks all FIBs.
		 */
		if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD)
			startfib = endfib = ifa->ifa_ifp->if_fib;
		else {
			startfib = 0;
			endfib = rt_numfibs - 1;
		}
	} else {
		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
		startfib = fibnum;
		endfib = fibnum;
	}

	/*
	 * If it's a delete, check that if it exists,
	 * it's on the correct interface or we might scrub
	 * a route to another ifa which would
	 * be confusing at best and possibly worse.
	 */
	if (cmd == RTM_DELETE) {
		/*
		 * It's a delete, so it should already exist..
		 * If it's a net, mask off the host bits
		 * (Assuming we have a mask)
		 * XXX this is kinda inet specific..
		 */
		if (netmask != NULL) {
			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
			dst = (struct sockaddr *)tempbuf;
		}
	}
	/*
	 * Now go through all the requested tables (fibs) and do the
	 * requested action. Realistically, this will either be fib 0
	 * for protocols that don't do multiple tables or all the
	 * tables for those that do.
	 */
	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
		if (cmd == RTM_DELETE) {
			struct radix_node *rn;
			/*
			 * Look up an rtentry that is in the routing tree and
			 * contains the correct info.
			 */
			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
			if (rnh == NULL)
				/* this table doesn't exist but others might */
				continue;
			RIB_RLOCK(rnh);
			rn = rnh->rnh_lookup(dst, netmask, &rnh->head);
#ifdef RADIX_MPATH
			if (rt_mpath_capable(rnh)) {

				if (rn == NULL)
					error = ESRCH;
				else {
					rt = RNTORT(rn);
					/*
					 * for interface route the
					 * rt->rt_gateway is sockaddr_intf
					 * for cloning ARP entries, so
					 * rt_mpath_matchgate must use the
					 * interface address
					 */
					rt = rt_mpath_matchgate(rt,
					    ifa->ifa_addr);
					if (rt == NULL)
						error = ESRCH;
				}
			}
#endif
			/*
			 * NOTE(review): this assignment is unconditional, so
			 * any ESRCH stored by the RADIX_MPATH block above is
			 * overwritten here -- confirm that is intended.
			 */
			error = (rn == NULL ||
			    (rn->rn_flags & RNF_ROOT) ||
			    RNTORT(rn)->rt_ifa != ifa);
			RIB_RUNLOCK(rnh);
			if (error) {
				/* this is only an error if bad on ALL tables */
				continue;
			}
		}
		/*
		 * Do the actual request
		 */
		bzero((caddr_t)&info, sizeof(info));
		ifa_ref(ifa);
		info.rti_ifa = ifa;
		info.rti_flags = flags |
		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
		info.rti_info[RTAX_DST] = dst;
		/*
		 * doing this for compatibility reasons
		 */
		if (cmd == RTM_ADD)
			info.rti_info[RTAX_GATEWAY] =
			    (struct sockaddr *)&null_sdl;
		else
			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
		info.rti_info[RTAX_NETMASK] = netmask;
		error = rtrequest1_fib(cmd, &info, &rt, fibnum);

		if (error == 0 && rt != NULL) {
			/*
			 * notify any listening routing agents of the change
			 */
			RT_LOCK(rt);
#ifdef RADIX_MPATH
			/*
			 * in case address alias finds the first address
			 * e.g. ifconfig bge0 192.0.2.246/24
			 * e.g. ifconfig bge0 192.0.2.247/24
			 * the address set in the route is 192.0.2.246
			 * so we need to replace it with 192.0.2.247
			 */
			if (memcmp(rt->rt_ifa->ifa_addr,
			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
				ifa_free(rt->rt_ifa);
				ifa_ref(ifa);
				rt->rt_ifp = ifa->ifa_ifp;
				rt->rt_ifa = ifa;
			}
#endif
			/*
			 * doing this for compatibility reasons
			 */
			if (cmd == RTM_ADD) {
				((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
				    rt->rt_ifp->if_type;
				((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
				    rt->rt_ifp->if_index;
			}
			/*
			 * Hold an extra reference while the entry is
			 * unlocked for the notification call.
			 */
			RT_ADDREF(rt);
			RT_UNLOCK(rt);
			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
			RT_LOCK(rt);
			RT_REMREF(rt);
			if (cmd == RTM_DELETE) {
				/*
				 * If we are deleting, and we found an entry,
				 * then it's been removed from the tree..
				 * now throw it away.
				 */
				RTFREE_LOCKED(rt);
			} else {
				if (cmd == RTM_ADD) {
					/*
					 * We just wanted to add it..
					 * we don't actually need a reference.
					 */
					RT_REMREF(rt);
				}
				RT_UNLOCK(rt);
			}
			didwork = 1;
		}
		if (error)
			a_failure = error;
	}
	if (cmd == RTM_DELETE) {
		if (didwork) {
			error = 0;
		} else {
			/* we only give an error if it wasn't in any table */
			error = ((flags & RTF_HOST) ?
			    EHOSTUNREACH : ENETUNREACH);
		}
	} else {
		if (a_failure) {
			/* return an error if any of them failed */
			error = a_failure;
		}
	}
	return (error);
}

/*
 * Set up a routing table entry, normally
 * for an interface.
 *
 * Wrapper around rtinit1() that picks the FIB scope from the address
 * family: RT_ALL_FIBS for AF_INET/AF_INET6, RT_DEFAULT_FIB otherwise.
 */
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
	struct sockaddr *dst;
	int fib = RT_DEFAULT_FIB;

	if (flags & RTF_HOST) {
		dst = ifa->ifa_dstaddr;
	} else {
		dst = ifa->ifa_addr;
	}

	switch (dst->sa_family) {
	case AF_INET6:
	case AF_INET:
		/* We do support multiple FIBs. */
		fib = RT_ALL_FIBS;
		break;
	}
	return (rtinit1(ifa, cmd, flags, fib));
}

/*
 * Announce interface address arrival/withdraw
 * Returns 0 on success.
*/ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. 
*/ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: head/sys/net/rtsock.c =================================================================== --- head/sys/net/rtsock.c (revision 335923) +++ head/sys/net/rtsock.c (revision 335924) @@ -1,1977 +1,1974 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #define SA_SIZE32(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? 
\ sizeof(int) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) ) #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct 
netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void vnet_rts_init(void) { int tmp; if (IS_DEFAULT_VNET(curvnet)) { if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } #ifdef VIMAGE else netisr_register_vnet(&rtsock_nh); #endif } VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_init, 0); #ifdef VIMAGE static void vnet_rts_uninit(void) { netisr_unregister_vnet(&rtsock_nh); } VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. 
*/ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case 
AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. 
*/ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) 
{ RIB_RLOCK_TRACKER; struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rnh; struct rt_addrinfo info; struct sockaddr_storage ss; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. 
*/ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info.rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. 
*/ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); } saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide longest prefix match for * address lookup (no mask). 
* 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; NET_EPOCH_ENTER(); ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); NET_EPOCH_EXIT(); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RIB_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? 
jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. 
*/ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. 
*/ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? 
*/ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. 
*
 */
static int
rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
{
	int i;
	int len, buflen = 0, dlen;
	caddr_t cp = NULL;
	struct rt_msghdr *rtm = NULL;
#ifdef INET6
	struct sockaddr_storage ss;
	struct sockaddr_in6 *sin6;
#endif
#ifdef COMPAT_FREEBSD32
	bool compat32 = false;
#endif

	/*
	 * Pick the fixed header size for this message type; the "l" and
	 * 32-bit compat layouts depend on the walk operation and request.
	 */
	switch (type) {

	case RTM_DELADDR:
	case RTM_NEWADDR:
		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
#ifdef COMPAT_FREEBSD32
			if (w->w_req->flags & SCTL_MASK32) {
				len = sizeof(struct ifa_msghdrl32);
				compat32 = true;
			} else
#endif
				len = sizeof(struct ifa_msghdrl);
		} else
			len = sizeof(struct ifa_msghdr);
		break;

	case RTM_IFINFO:
#ifdef COMPAT_FREEBSD32
		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
			if (w->w_op == NET_RT_IFLISTL)
				len = sizeof(struct if_msghdrl32);
			else
				len = sizeof(struct if_msghdr32);
			compat32 = true;
			break;
		}
#endif
		if (w != NULL && w->w_op == NET_RT_IFLISTL)
			len = sizeof(struct if_msghdrl);
		else
			len = sizeof(struct if_msghdr);
		break;

	case RTM_NEWMADDR:
		len = sizeof(struct ifma_msghdr);
		break;

	default:
		len = sizeof(struct rt_msghdr);
	}

	/* With a buffer, sockaddrs are written right after the header. */
	if (w != NULL) {
		rtm = (struct rt_msghdr *)w->w_tmem;
		buflen = w->w_tmemsize - len;
		cp = (caddr_t)w->w_tmem + len;
	}

	/* rti_addrs is rebuilt from the rti_info[] slots that are set. */
	rtinfo->rti_addrs = 0;
	for (i = 0; i < RTAX_MAX; i++) {
		struct sockaddr *sa;

		if ((sa = rtinfo->rti_info[i]) == NULL)
			continue;
		rtinfo->rti_addrs |= (1 << i);
#ifdef COMPAT_FREEBSD32
		if (compat32)
			dlen = SA_SIZE32(sa);
		else
#endif
			dlen = SA_SIZE(sa);
		if (cp != NULL && buflen >= dlen) {
#ifdef INET6
			if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
				sin6 = (struct sockaddr_in6 *)&ss;
				bcopy(sa, sin6, sizeof(*sin6));
				if (sa6_recoverscope(sin6) == 0)
					sa = (struct sockaddr *)sin6;
			}
#endif
			bcopy((caddr_t)sa, cp, (unsigned)dlen);
			cp += dlen;
			buflen -= dlen;
		} else if (cp != NULL) {
			/*
			 * Buffer too small. Count needed size
			 * and return with error.
			 */
			cp = NULL;
		}

		len += dlen;
	}

	/* Account for the padding that aligns the total message length. */
	if (cp != NULL) {
		dlen = ALIGN(len) - len;
		if (buflen < dlen)
			cp = NULL;
		else
			buflen -= dlen;
	}
	len = ALIGN(len);

	if (cp != NULL) {
		/* fill header iff buffer is large enough */
		rtm->rtm_version = RTM_VERSION;
		rtm->rtm_type = type;
		rtm->rtm_msglen = len;
	}

	*plen = len;

	/* A buffer was supplied but turned out to be too small. */
	if (w != NULL && cp == NULL)
		return (ENOBUFS);

	return (0);
}

/*
 * This routine is called to generate a message from the routing
 * socket indicating that a redirect has occurred, a routing lookup
 * has failed, or that a protocol has detected timeouts to a particular
 * destination.
 *
 * Nothing is sent when no routing socket is attached or the mbuf cannot be
 * allocated.  Unless @fibnum is RT_ALL_FIBS, the message is tagged with the
 * FIB and marked for per-FIB filtering.
 */
void
rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
    int fibnum)
{
	struct rt_msghdr *rtm;
	struct mbuf *m;
	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];

	if (V_route_cb.any_count == 0)
		return;
	m = rtsock_msg_mbuf(type, rtinfo);
	if (m == NULL)
		return;

	if (fibnum != RT_ALL_FIBS) {
		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
		M_SETFIB(m, fibnum);
		m->m_flags |= RTS_FILTER_FIB;
	}

	rtm = mtod(m, struct rt_msghdr *);
	rtm->rtm_flags = RTF_DONE | flags;
	rtm->rtm_errno = error;
	rtm->rtm_addrs = rtinfo->rti_addrs;
	/* Dispatch under the destination's family, if there is one. */
	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
}

/* Same as rt_missmsg_fib(), addressed to all FIBs. */
void
rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
{

	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
}

/*
 * This routine is called to generate a message from the routing
 * socket indicating that the status of a network interface has changed.
*/ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. 
*/
int
rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
    int fibnum)
{
	struct rt_addrinfo info;
	struct sockaddr *sa;
	struct mbuf *m;
	struct rt_msghdr *rtm;
	struct sockaddr_storage ss;

	if (V_route_cb.any_count == 0)
		return (0);

	/* Payload: destination, (fixed-up) netmask and gateway of @rt. */
	bzero((caddr_t)&info, sizeof(info));
	info.rti_info[RTAX_DST] = sa = rt_key(rt);
	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss);
	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
	if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL)
		return (ENOBUFS);
	rtm = mtod(m, struct rt_msghdr *);
	rtm->rtm_index = ifp->if_index;
	rtm->rtm_flags |= rt->rt_flags;
	rtm->rtm_errno = error;
	rtm->rtm_addrs = info.rti_addrs;

	/* Restrict delivery to one FIB unless the caller asked for all. */
	if (fibnum != RT_ALL_FIBS) {
		M_SETFIB(m, fibnum);
		m->m_flags |= RTS_FILTER_FIB;
	}

	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);

	return (0);
}

/*
 * This is the analogue to the rt_newaddrmsg which performs the same
 * function but for multicast group memberships.  This is easier since
 * there is no route state to worry about.
 */
void
rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
{
	struct rt_addrinfo info;
	struct mbuf *m = NULL;
	struct ifnet *ifp = ifma->ifma_ifp;
	struct ifma_msghdr *ifmam;

	if (V_route_cb.any_count == 0)
		return;

	bzero((caddr_t)&info, sizeof(info));
	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
	if (ifp && ifp->if_addr)
		info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
	else
		info.rti_info[RTAX_IFP] = NULL;
	/*
	 * If a link-layer address is present, present it as a ``gateway''
	 * (similarly to how ARP entries, e.g., are presented).
	 */
	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
	m = rtsock_msg_mbuf(cmd, &info);
	if (m == NULL)
		return;
	ifmam = mtod(m, struct ifma_msghdr *);
	/*
	 * ifp was tolerated as NULL while collecting addresses above, but it
	 * is dereferenced unconditionally from here on.
	 */
	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
	    __func__));
	ifmam->ifmam_index = ifp->if_index;
	ifmam->ifmam_addrs = info.rti_addrs;
	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
}

/*
 * Build the common part of an interface announcement message of the given
 * @type for @ifp: interface index, interface name and the @what code.
 * @info is zeroed and used as the (empty) address set for the message.
 *
 * Returns the new mbuf, or NULL when V_route_cb.any_count is zero or no
 * mbuf could be obtained.  The caller dispatches or frees the mbuf.
 */
static struct mbuf *
rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
	struct rt_addrinfo *info)
{
	struct if_announcemsghdr *ifan;
	struct mbuf *m;

	if (V_route_cb.any_count == 0)
		return NULL;
	bzero((caddr_t)info, sizeof(*info));
	m = rtsock_msg_mbuf(type, info);
	if (m != NULL) {
		ifan = mtod(m, struct if_announcemsghdr *);
		ifan->ifan_index = ifp->if_index;
		strlcpy(ifan->ifan_name, ifp->if_xname,
			sizeof(ifan->ifan_name));
		ifan->ifan_what = what;
	}
	return m;
}

/*
 * This is called to generate routing socket messages indicating
 * IEEE80211 wireless events.
 * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
 *
 * @data/@data_len describe an opaque event payload that is appended after
 * the announce header; ifan_msglen (and the packet header length, if
 * present) are grown by @data_len to match.
 */
void
rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
{
	struct mbuf *m;
	struct rt_addrinfo info;

	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
	if (m != NULL) {
		/*
		 * Append the ieee80211 data.  Try to stick it in the
		 * mbuf containing the ifannounce msg; otherwise allocate
		 * a new mbuf and append.
		 *
		 * NB: we assume m is a single mbuf.
		 */
		if (data_len > M_TRAILINGSPACE(m)) {
			struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
			if (n == NULL) {
				m_freem(m);
				return;
			}
			/*
			 * NOTE(review): nothing here bounds data_len against
			 * the capacity of the freshly allocated mbuf --
			 * confirm that callers never pass more than fits.
			 */
			bcopy(data, mtod(n, void *), data_len);
			n->m_len = data_len;
			m->m_next = n;
		} else if (data_len > 0) {
			bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
			m->m_len += data_len;
		}
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len += data_len;
		mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
		rt_dispatch(m, AF_UNSPEC);
	}
}

/*
 * This is called to generate routing socket messages indicating
 * network interface arrival and departure.
*/
void
rt_ifannouncemsg(struct ifnet *ifp, int what)
{
	struct mbuf *m;
	struct rt_addrinfo info;

	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
	if (m != NULL)
		rt_dispatch(m, AF_UNSPEC);
}

/*
 * Queue a finished routing message to the NETISR_ROUTE netisr.
 * Consumes @m on every path: it is either queued or freed here.
 * @saf is the address family to record with the message, or AF_UNSPEC
 * for none.
 */
static void
rt_dispatch(struct mbuf *m, sa_family_t saf)
{
	struct m_tag *tag;

	/*
	 * Preserve the family from the sockaddr, if any, in an m_tag for
	 * use when injecting the mbuf into the routing socket buffer from
	 * the netisr.
	 */
	if (saf != AF_UNSPEC) {
		tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
		    M_NOWAIT);
		if (tag == NULL) {
			m_freem(m);
			return;
		}
		/* The tag payload sits directly behind the m_tag header. */
		*(unsigned short *)(tag + 1) = saf;
		m_tag_prepend(m, tag);
	}
#ifdef VIMAGE
	/* Without V_loif to use as rcvif the message is dropped. */
	if (V_loif)
		m->m_pkthdr.rcvif = V_loif;
	else {
		m_freem(m);
		return;
	}
#endif
	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
}

/*
 * This is used in dumping the kernel table via sysctl().
 *
 * Tree-walk callback (passed to rnh_walktree() by sysctl_rtsock()): @rn is
 * the route entry and @vw the struct walkarg of the request.  Formats one
 * RTM_GET message into w->w_tmem and copies it out.  Returns 0 to continue
 * the walk, or the error from rtsock_msg_buffer()/SYSCTL_OUT().
 */
static int
sysctl_dumpentry(struct radix_node *rn, void *vw)
{
	struct walkarg *w = vw;
	struct rtentry *rt = (struct rtentry *)rn;
	int error = 0, size;
	struct rt_addrinfo info;
	struct sockaddr_storage ss;

	/* NET_RT_FLAGS: only entries having one of the requested flags. */
	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
		return 0;
	/*
	 * Visibility check against the requesting credential: non-host
	 * routes are tested with jailed_without_vnet(), host routes with
	 * prison_if() on the route key.  Filtered entries are skipped.
	 */
	if ((rt->rt_flags & RTF_HOST) == 0
	    ? jailed_without_vnet(w->w_req->td->td_ucred)
	    : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
		return (0);
	bzero((caddr_t)&info, sizeof(info));
	info.rti_info[RTAX_DST] = rt_key(rt);
	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
	    rt_mask(rt), &ss);
	info.rti_info[RTAX_GENMASK] = 0;
	if (rt->rt_ifp) {
		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
		if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
	}
	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
		return (error);
	if (w->w_req && w->w_tmem) {
		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;

		/* RTF_GWFLAG_COMPAT is reported to userland as RTF_GATEWAY. */
		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
			rtm->rtm_flags = RTF_GATEWAY |
				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
		else
			rtm->rtm_flags = rt->rt_flags;
		rt_getmetrics(rt, &rtm->rtm_rmx);
		/*
		 * NOTE(review): rt->rt_ifp is NULL-checked above but is
		 * dereferenced unconditionally here -- confirm it cannot be
		 * NULL on this path.
		 */
		rtm->rtm_index = rt->rt_ifp->if_index;
		rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
		rtm->rtm_addrs = info.rti_addrs;
		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
		return (error);
	}
	return (error);
}

/*
 * Fill in the if_msghdrl header (or its 32-bit compat layout when the
 * request has SCTL_MASK32 set) at the start of w->w_tmem for @ifp, copy
 * @src_ifd into its if_data member, and copy @len bytes out to the request.
 * Returns the SYSCTL_OUT() result.
 */
static int
sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd,
    struct rt_addrinfo *info, struct walkarg *w, int len)
{
	struct if_msghdrl *ifm;
	struct if_data *ifd;

	ifm = (struct if_msghdrl *)w->w_tmem;

#ifdef COMPAT_FREEBSD32
	if (w->w_req->flags & SCTL_MASK32) {
		struct if_msghdrl32 *ifm32;

		ifm32 = (struct if_msghdrl32 *)ifm;
		ifm32->ifm_addrs = info->rti_addrs;
		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
		ifm32->ifm_index = ifp->if_index;
		ifm32->_ifm_spare1 = 0;
		ifm32->ifm_len = sizeof(*ifm32);
		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
		ifd = &ifm32->ifm_data;
	} else
#endif
	{
		ifm->ifm_addrs = info->rti_addrs;
		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
		ifm->ifm_index = ifp->if_index;
		ifm->_ifm_spare1 = 0;
		ifm->ifm_len = sizeof(*ifm);
		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
		ifd = &ifm->ifm_data;
	}

	memcpy(ifd, src_ifd, sizeof(*ifd));

	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
}

/*
 * Same as sysctl_iflist_ifml() but for the shorter if_msghdr header (or
 * if_msghdr32 for SCTL_MASK32 requests), which has no length/offset fields.
 */
static int
sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd,
    struct rt_addrinfo *info, struct walkarg *w, int len)
{
	struct if_msghdr *ifm;
	struct if_data *ifd;

	ifm = (struct if_msghdr *)w->w_tmem;

#ifdef COMPAT_FREEBSD32
	if (w->w_req->flags & SCTL_MASK32) {
		struct if_msghdr32 *ifm32;

		ifm32 = (struct if_msghdr32 *)ifm;
		ifm32->ifm_addrs = info->rti_addrs;
		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
		ifm32->ifm_index = ifp->if_index;
		ifd = &ifm32->ifm_data;
	} else
#endif
	{
		ifm->ifm_addrs = info->rti_addrs;
		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
		ifm->ifm_index = ifp->if_index;
		ifd = &ifm->ifm_data;
	}

	memcpy(ifd, src_ifd, sizeof(*ifd));

	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
}

/*
 * Fill in the ifa_msghdrl header (or ifa_msghdrl32 for SCTL_MASK32
 * requests) in w->w_tmem for address @ifa, including a zeroed if_data that
 * carries only the per-address packet/byte counters and, when the carp hook
 * is set, the vhid.  Copies @len bytes out and returns the SYSCTL_OUT()
 * result.
 */
static int
sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
    struct walkarg *w, int len)
{
	struct ifa_msghdrl *ifam;
	struct if_data *ifd;

	ifam = (struct ifa_msghdrl *)w->w_tmem;

#ifdef COMPAT_FREEBSD32
	if (w->w_req->flags & SCTL_MASK32) {
		struct ifa_msghdrl32 *ifam32;

		ifam32 = (struct ifa_msghdrl32 *)ifam;
		ifam32->ifam_addrs = info->rti_addrs;
		ifam32->ifam_flags = ifa->ifa_flags;
		ifam32->ifam_index = ifa->ifa_ifp->if_index;
		ifam32->_ifam_spare1 = 0;
		ifam32->ifam_len = sizeof(*ifam32);
		ifam32->ifam_data_off =
		    offsetof(struct ifa_msghdrl32, ifam_data);
		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
		ifd = &ifam32->ifam_data;
	} else
#endif
	{
		ifam->ifam_addrs = info->rti_addrs;
		ifam->ifam_flags = ifa->ifa_flags;
		ifam->ifam_index = ifa->ifa_ifp->if_index;
		ifam->_ifam_spare1 = 0;
		ifam->ifam_len = sizeof(*ifam);
		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
		ifd = &ifam->ifam_data;
	}

	bzero(ifd, sizeof(*ifd));
	ifd->ifi_datalen = sizeof(struct if_data);
	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);

	/* Fixup if_data carp(4) vhid. */
	if (carp_get_vhid_p != NULL)
		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);

	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
}

/*
 * Fill in the plain ifa_msghdr header in w->w_tmem for address @ifa and
 * copy @len bytes out.  Returns the SYSCTL_OUT() result.
 */
static int
sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
    struct walkarg *w, int len)
{
	struct ifa_msghdr *ifam;

	ifam = (struct ifa_msghdr *)w->w_tmem;
	ifam->ifam_addrs = info->rti_addrs;
	ifam->ifam_flags = ifa->ifa_flags;
	ifam->ifam_index = ifa->ifa_ifp->if_index;
	ifam->ifam_metric = ifa->ifa_ifp->if_metric;

	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
}

/*
 * NET_RT_IFLIST / NET_RT_IFLISTL handler: for every interface (or only the
 * one whose index equals w->w_arg, when that is non-zero) emit an RTM_IFINFO
 * message followed by one RTM_NEWADDR message per address that follows
 * ifp->if_addr in the address list, filtered by @af (0 = any family) and by
 * prison_if() against the requesting credential.
 * Returns 0 or the first error from formatting/copy-out.
 *
 * The lines prefixed with '+' and '-' below are hunk lines of the enclosing
 * patch and are kept exactly as they appear in it.
 */
static int
sysctl_iflist(int af, struct walkarg *w)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct if_data ifd;
	struct rt_addrinfo info;
	int len, error = 0;
	struct sockaddr_storage ss;
+	struct epoch_tracker et;

	bzero((caddr_t)&info, sizeof(info));
	bzero(&ifd, sizeof(ifd));
-	IFNET_RLOCK_NOSLEEP();
+	NET_EPOCH_ENTER_ET(et);
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if (w->w_arg && w->w_arg != ifp->if_index)
			continue;
		if_data_copy(ifp, &ifd);
-		IF_ADDR_RLOCK(ifp);
		ifa = ifp->if_addr;
		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
		if (error != 0)
			goto done;
		info.rti_info[RTAX_IFP] = NULL;
		if (w->w_req && w->w_tmem) {
			if (w->w_op == NET_RT_IFLISTL)
				error = sysctl_iflist_ifml(ifp, &ifd, &info, w,
				    len);
			else
				error = sysctl_iflist_ifm(ifp, &ifd, &info, w,
				    len);
			if (error)
				goto done;
		}
		while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) {
			if (af && af != ifa->ifa_addr->sa_family)
				continue;
			if (prison_if(w->w_req->td->td_ucred,
			    ifa->ifa_addr) != 0)
				continue;
			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
			if (error != 0)
				goto done;
			if (w->w_req && w->w_tmem) {
				if (w->w_op == NET_RT_IFLISTL)
					error = sysctl_iflist_ifaml(ifa, &info,
					    w, len);
				else
					error = sysctl_iflist_ifam(ifa, &info,
					    w, len);
				if (error)
					goto done;
			}
		}
-		IF_ADDR_RUNLOCK(ifp);
		info.rti_info[RTAX_IFA] = NULL;
		info.rti_info[RTAX_NETMASK] = NULL;
		info.rti_info[RTAX_BRD] = NULL;
	}
done:
-	if (ifp != NULL)
-		IF_ADDR_RUNLOCK(ifp);
-	IFNET_RUNLOCK_NOSLEEP();
+	NET_EPOCH_EXIT_ET(et);
	return (error);
}

/*
 * NET_RT_IFMALIST handler: for every interface (or only the one whose index
 * equals w->w_arg, when that is non-zero) emit one RTM_NEWMADDR message per
 * multicast address, filtered by @af (0 = any family) and by prison_if()
 * against the requesting credential.  The link-layer address is reported in
 * the RTAX_GATEWAY slot for non-AF_LINK memberships.
 * Returns 0 or the first error from formatting/copy-out.
 */
static int
sysctl_ifmalist(int af, struct walkarg *w)
{
	struct rt_addrinfo info;
	struct ifaddr *ifa;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp;
	int error, len;

	error = 0;
	bzero((caddr_t)&info, sizeof(info));

	IFNET_RLOCK_NOSLEEP();
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if (w->w_arg && w->w_arg != ifp->if_index)
			continue;
		ifa = ifp->if_addr;
		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
		IF_ADDR_RLOCK(ifp);
		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (af && af != ifma->ifma_addr->sa_family)
				continue;
			if (prison_if(w->w_req->td->td_ucred,
			    ifma->ifma_addr) != 0)
				continue;
			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
			info.rti_info[RTAX_GATEWAY] =
			    (ifma->ifma_addr->sa_family != AF_LINK) ?
			    ifma->ifma_lladdr : NULL;
			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
			if (error != 0)
				break;
			if (w->w_req && w->w_tmem) {
				struct ifma_msghdr *ifmam;

				ifmam = (struct ifma_msghdr *)w->w_tmem;
				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
				ifmam->ifmam_flags = 0;
				ifmam->ifmam_addrs = info.rti_addrs;
				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
				if (error != 0)
					break;
			}
		}
		/* The inner break leaves the address lock held; drop it first. */
		IF_ADDR_RUNLOCK(ifp);
		if (error != 0)
			break;
	}
	IFNET_RUNLOCK_NOSLEEP();
	return (error);
}

/*
 * Read-only sysctl handler for the routing-table node declared below.
 * After skipping the first name component, the remaining components are
 * interpreted as { af, op, arg [, fib] }: op selects the dump
 * (NET_RT_DUMP, NET_RT_FLAGS, NET_RT_IFLIST, NET_RT_IFLISTL,
 * NET_RT_IFMALIST), af restricts the address family (0 = all) and arg is
 * op-specific.  The optional fourth component is accepted for NET_RT_DUMP
 * only.
 *
 * Returns EPERM on write attempts, EISDIR/ENOTDIR for a wrong number of
 * name components, EINVAL for an out-of-range fib or af, otherwise the
 * result of the selected dump.  An op that matches no case in the switch
 * returns 0 without producing output.
 */
static int
sysctl_rtsock(SYSCTL_HANDLER_ARGS)
{
	RIB_RLOCK_TRACKER;
	int	*name = (int *)arg1;
	u_int	namelen = arg2;
	struct rib_head *rnh = NULL; /* silence compiler. */
	int	i, lim, error = EINVAL;
	int	fib = 0;
	u_char	af;
	struct	walkarg w;

	name ++;
	namelen--;
	if (req->newptr)
		return (EPERM);
	/*
	 * NOTE(review): name[1] is read before namelen has been validated --
	 * confirm the sysctl framework guarantees enough components here.
	 */
	if (name[1] == NET_RT_DUMP) {
		if (namelen == 3)
			fib = req->td->td_proc->p_fibnum;
		else if (namelen == 4)
			fib = (name[3] == RT_ALL_FIBS) ?
			    req->td->td_proc->p_fibnum : name[3];
		else
			return ((namelen < 3) ? EISDIR : ENOTDIR);
		if (fib < 0 || fib >= rt_numfibs)
			return (EINVAL);
	} else if (namelen != 3)
		return ((namelen < 3) ? EISDIR : ENOTDIR);
	af = name[0];
	if (af > AF_MAX)
		return (EINVAL);
	bzero(&w, sizeof(w));
	w.w_op = name[1];
	w.w_arg = name[2];
	w.w_req = req;

	error = sysctl_wire_old_buffer(req, 0);
	if (error)
		return (error);

	/*
	 * Allocate reply buffer in advance.
	 * All rtsock messages have a maximum length of u_short.
	 */
	w.w_tmemsize = 65536;
	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);

	switch (w.w_op) {

	case NET_RT_DUMP:
	case NET_RT_FLAGS:
		if (af == 0) {			/* dump all tables */
			i = 1;
			lim = AF_MAX;
		} else				/* dump only one table */
			i = lim = af;

		/*
		 * take care of llinfo entries, the caller must
		 * specify an AF
		 */
		if (w.w_op == NET_RT_FLAGS &&
		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
			if (af != 0)
				error = lltable_sysctl_dumparp(af, w.w_req);
			else
				error = EINVAL;
			break;
		}
		/*
		 * take care of routing entries
		 */
		for (error = 0; error == 0 && i <= lim; i++) {
			rnh = rt_tables_get_rnh(fib, i);
			if (rnh != NULL) {
				RIB_RLOCK(rnh);
				error = rnh->rnh_walktree(&rnh->head,
				    sysctl_dumpentry, &w);
				RIB_RUNLOCK(rnh);
			} else if (af != 0)
				/* A specific family was requested but has no table. */
				error = EAFNOSUPPORT;
		}
		break;

	case NET_RT_IFLIST:
	case NET_RT_IFLISTL:
		error = sysctl_iflist(af, &w);
		break;

	case NET_RT_IFMALIST:
		error = sysctl_ifmalist(af, &w);
		break;
	}

	free(w.w_tmem, M_TEMP);
	return (error);
}

static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");

/*
 * Definitions of protocols supported in the ROUTE domain.
*/ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[nitems(routesw)] }; VNET_DOMAIN_SET(route); Index: head/sys/netinet/in_gif.c =================================================================== --- head/sys/netinet/in_gif.c (revision 335923) +++ head/sys/netinet/in_gif.c (revision 335924) @@ -1,406 +1,407 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #define GIF_TTL 30 static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; #define V_ip_gif_ttl VNET(ip_gif_ttl) SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gif_ttl), 0, "Default TTL value for encapsulated packets"); /* * We keep interfaces in a hash table using src+dst as key. * Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list. 
*/ static VNET_DEFINE(struct gif_list *, ipv4_hashtbl) = NULL; static VNET_DEFINE(struct gif_list, ipv4_list) = CK_LIST_HEAD_INITIALIZER(); #define V_ipv4_hashtbl VNET(ipv4_hashtbl) #define V_ipv4_list VNET(ipv4_list) #define GIF_HASH(src, dst) (V_ipv4_hashtbl[\ in_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)]) #define GIF_HASH_SC(sc) GIF_HASH((sc)->gif_iphdr->ip_src.s_addr,\ (sc)->gif_iphdr->ip_dst.s_addr) static uint32_t in_gif_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static int in_gif_checkdup(const struct gif_softc *sc, in_addr_t src, in_addr_t dst) { struct gif_softc *tmp; if (sc->gif_family == AF_INET && sc->gif_iphdr->ip_src.s_addr == src && sc->gif_iphdr->ip_dst.s_addr == dst) return (EEXIST); CK_LIST_FOREACH(tmp, &GIF_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->gif_iphdr->ip_src.s_addr == src && tmp->gif_iphdr->ip_dst.s_addr == dst) return (EADDRNOTAVAIL); } return (0); } static void in_gif_attach(struct gif_softc *sc) { if (sc->gif_options & GIF_IGNORE_SOURCE) CK_LIST_INSERT_HEAD(&V_ipv4_list, sc, chain); else CK_LIST_INSERT_HEAD(&GIF_HASH_SC(sc), sc, chain); } int in_gif_setopts(struct gif_softc *sc, u_int options) { /* NOTE: we are protected with gif_ioctl_sx lock */ MPASS(sc->gif_family == AF_INET); MPASS(sc->gif_options != options); if ((options & GIF_IGNORE_SOURCE) != (sc->gif_options & GIF_IGNORE_SOURCE)) { CK_LIST_REMOVE(sc, chain); sc->gif_options = options; in_gif_attach(sc); } return (0); } int in_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *dst, *src; struct ip *ip; int error; /* NOTE: we are protected with gif_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin_family != dst->sin_family || 
src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(*src)) break; if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } if (V_ipv4_hashtbl == NULL) V_ipv4_hashtbl = gif_hashinit(); error = in_gif_checkdup(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. */ error = 0; break; } ip = malloc(sizeof(*ip), M_GIF, M_WAITOK | M_ZERO); ip->ip_src.s_addr = src->sin_addr.s_addr; ip->ip_dst.s_addr = dst->sin_addr.s_addr; if (sc->gif_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, chain); GIF_WAIT(); free(sc->gif_hdr, M_GIF); /* XXX: should we notify about link state change? */ } sc->gif_family = AF_INET; sc->gif_iphdr = ip; in_gif_attach(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (sc->gif_family != AF_INET) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); src->sin_addr = (cmd == SIOCGIFPSRCADDR) ? 
sc->gif_iphdr->ip_src: sc->gif_iphdr->ip_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } int in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { struct gif_softc *sc = ifp->if_softc; struct ip *ip; int len; /* prepend new IP header */ - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); len = sizeof(struct ip); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip = mtod(m, struct ip *); MPASS(sc->gif_family == AF_INET); bcopy(sc->gif_iphdr, ip, sizeof(struct ip)); ip->ip_p = proto; /* version will be set in ip_output() */ ip->ip_ttl = V_ip_gif_ttl; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_tos = ecn; return (ip_output(m, NULL, NULL, 0, NULL, NULL)); } static int in_gif_input(struct mbuf *m, int off, int proto, void *arg) { struct gif_softc *sc = arg; struct ifnet *gifp; struct ip *ip; uint8_t ecn; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); if (sc == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip = mtod(m, struct ip *); ecn = ip->ip_tos; m_adj(m, off); gif_input(m, gifp, proto, ecn); } else { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); } return (IPPROTO_DONE); } static int in_gif_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct gif_softc *sc; int ret; if (V_ipv4_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); /* * NOTE: it is safe to iterate without any locking here, because softc * can be reclaimed only 
when we are not within net_epoch_preempt * section, but ip_encap lookup+input are executed in epoch section. */ ret = 0; CK_LIST_FOREACH(sc, &GIF_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { /* * This is an inbound packet, its ip_dst is source address * in softc. */ if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr && sc->gif_iphdr->ip_dst.s_addr == ip->ip_src.s_addr) { ret = ENCAP_DRV_LOOKUP; goto done; } } /* * No exact match. * Check the list of interfaces with GIF_IGNORE_SOURCE flag. */ CK_LIST_FOREACH(sc, &V_ipv4_list, chain) { if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr) { ret = 32 + 8; /* src + proto */ goto done; } } return (0); done: if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { struct nhop4_basic nh4; struct in_addr dst; dst = ip->ip_src; if (fib4_lookup_nh_basic(sc->gif_fibnum, dst, 0, 0, &nh4) != 0) return (0); if (nh4.nh_ifp != m->m_pkthdr.rcvif) return (0); } *arg = sc; return (ret); } static struct { const struct encap_config encap; const struct encaptab *cookie; } ipv4_encap_cfg[] = { { .encap = { .proto = IPPROTO_IPV4, .min_length = 2 * sizeof(struct ip), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in_gif_lookup, .input = in_gif_input }, }, #ifdef INET6 { .encap = { .proto = IPPROTO_IPV6, .min_length = sizeof(struct ip) + sizeof(struct ip6_hdr), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in_gif_lookup, .input = in_gif_input }, }, #endif { .encap = { .proto = IPPROTO_ETHERIP, .min_length = sizeof(struct ip) + sizeof(struct etherip_header) + sizeof(struct ether_header), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in_gif_lookup, .input = in_gif_input }, } }; void in_gif_init(void) { int i; if (!IS_DEFAULT_VNET(curvnet)) return; for (i = 0; i < nitems(ipv4_encap_cfg); i++) ipv4_encap_cfg[i].cookie = ip_encap_attach( &ipv4_encap_cfg[i].encap, NULL, M_WAITOK); } void in_gif_uninit(void) { int i; if (IS_DEFAULT_VNET(curvnet)) { for (i = 0; i 
< nitems(ipv4_encap_cfg); i++) ip_encap_detach(ipv4_encap_cfg[i].cookie); } if (V_ipv4_hashtbl != NULL) gif_hashdestroy(V_ipv4_hashtbl); } Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 335923) +++ head/sys/netinet/in_pcb.c (revision 335924) @@ -1,3458 +1,3453 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef TCPHPTS #include #endif #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. 
There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 
&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. 
*/ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (!grp) return (NULL); grp->il_vflag = vflag; grp->il_lport = port; grp->il_dependladdr = *addr; grp->il_inpsiz = size; LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { LIST_REMOVE(grp, il_list); free(grp, M_TEMP); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size); if (!grp) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= (grp->il_inpsiz / 4)) { /* Shrink this group. */ struct inpcblbgroup *new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp) *grpp = new_grp; } return; } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. 
*/ static int in_pcbinslbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint16_t hashmask, lport; uint32_t group_index; struct ucred *cred; static int limit_logged = 0; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (pcbinfo->ipi_lbgrouphashbase == NULL) return (0); hashmask = pcbinfo->ipi_lbgrouphashmask; lport = inp->inp_lport; group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[group_index]; /* * Don't allow jailed socket to join local group. */ if (inp->inp_socket != NULL) cred = inp->inp_socket->so_cred; else cred = NULL; if (cred != NULL && jailed(cred)) return (0); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. */ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) { break; } } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN); if (!grp) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (!limit_logged) { limit_logged = 1; printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); } return (0); } /* Expand this local group. 
*/ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (!grp) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (pcbinfo->ipi_lbgrouphashbase == NULL) return; hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } return; } } } /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? 
*/ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. 
*/ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. 
*/ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. 
For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. 
*/ int inp_so_options(const struct inpcb *inp) { int so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) so_options |= SO_REUSEPORT_LB; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. 
*/ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. 
*/ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) || (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. 
*/ tw = intotw(t); if (tw == NULL || ((reuseport & tw->tw_so_options) == 0 && (reuseport_lb & tw->tw_so_options) == 0)) { return (EADDRINUSE); } } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. 
*/ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. 
*/ NET_EPOCH_ENTER(); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. 
Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. 
*/ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: NET_EPOCH_EXIT(); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. 
*/ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. 
*/ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. 
* * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. 
*/ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. 
*/ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } void in_pcblist_rele_rlocked(epoch_context_t ctx) { struct in_pcblist *il; struct inpcb *inp; struct inpcbinfo *pcbinfo; int i, n; il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); pcbinfo = il->il_pcbinfo; n = il->il_count; INP_INFO_WLOCK(pcbinfo); for (i = 0; i < n; i++) { inp = il->il_inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); free(il, M_TEMP); } static void inpcbport_free(epoch_context_t ctx) { struct inpcbport *phd; phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); free(phd, M_PCB); } static void in_pcbfree_deferred(epoch_context_t ctx) { struct inpcb *inp; int released __unused; inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); INP_WLOCK(inp); #ifdef INET inp_freemoptions(inp->inp_moptions); inp->inp_moptions = NULL; #endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); ip6_freemoptions(inp->in6p_moptions); inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif released = in_pcbrele_wlocked(inp); MPASS(released); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. 
*/ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK(pcbinfo); in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); RO_INVALIDATE_CACHE(&inp->inp_route); /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; INP_WUNLOCK(inp); epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); +#ifdef INVARIANTS + if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) + MPASS(inp->inp_refcount > 1); +#endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? 
*/ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if 
((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { IN_MULTI_LOCK_ASSERT(); in_leavegroup_locked(imo->imo_membership[i], NULL); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. 
*/ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. 
*/ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, uint16_t fport, int lookupflags) { struct inpcb *local_wild = NULL; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *grp_local_wild; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Order of socket selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). * * NOTE: * - Load balanced group does not contain jailed sockets * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport == lport) { uint32_t idx = 0; int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport); idx = pkt_hash % grp->il_inpcnt; if (grp->il_laddr.s_addr == laddr->s_addr) { return (grp->il_inp[idx]); } else { if (grp->il_laddr.s_addr == INADDR_ANY && (lookupflags & INPLOOKUP_WILDCARD)) { local_wild = grp->il_inp[idx]; grp_local_wild = grp; } } } } if (local_wild != NULL) { return (local_wild); } return (NULL); } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. 
*/ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. 
*/ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. 
*/ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (NULL); } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. 
This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; +#ifdef INVARIANTS KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - - INP_HASH_LOCK_ASSERT(pcbinfo); - + if (!mtx_owned(&pcbinfo->ipi_hash_lock)) + MPASS(in_epoch_verbose(net_epoch_preempt, 1)); +#endif /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look in lb group (for wildcard match). */ if (pcbinfo->ipi_lbgrouphashbase != NULL && (lookupflags & INPLOOKUP_WILDCARD)) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags); if (inp != NULL) { return (inp); } } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. 
* 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). 
*/ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_WUNLOCK(inp); inp = NULL; } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); inp = NULL; } } else panic("%s: locking bug", __func__); #ifdef INVARIANTS if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); } #endif } INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. 
*/ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. 
*/ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { int ret = in_pcbinslbgrouphash(inp); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). */ return (ret); } } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. 
The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. 
*/ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); /* XXX: Only do if SO_REUSEPORT_LB set? */ in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. 
*/ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void 
inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); else bzero(&xi->xi_socket, sizeof(struct xsocket)); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if 
(inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? 
", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? 
", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. 
*/ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 
IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, }; int error; INP_WLOCK_ASSERT(inp); if (inp->inp_snd_tag != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); /* * At success increment the refcount on * the send tag's network interface: */ if (error == 0) if_ref(inp->inp_snd_tag->ifp); } return (error); } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; struct ifnet *ifp; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; ifp = mst->ifp; if (ifp == NULL) return; /* * If the device was detached while we still had reference(s) * on the ifp, we assume if_snd_tag_free() was replaced with * stubs. */ ifp->if_snd_tag_free(mst); /* release reference count on network interface */ if_rele(ifp); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. 
It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { struct socket *socket; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. 
*/ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #endif /* RATELIMIT */ Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h (revision 335923) +++ head/sys/netinet/in_pcb.h (revision 335924) @@ -1,891 +1,894 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #endif #include #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ CK_LIST_HEAD(inpcbhead, inpcb); CK_LIST_HEAD(inpcbporthead, inpcbport); typedef uint64_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; union in_dependaddr { struct in_addr_4in6 id46_addr; struct in6_addr id6_addr; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport, * lport, faddr to generate hash, so these fields shouldn't be moved. 
*/ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union in_dependaddr ie_dependfaddr; /* foreign host table entry */ union in_dependaddr ie_dependladdr; /* local host table entry */ #define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4 #define ie_laddr ie_dependladdr.id46_addr.ia46_addr4 #define ie6_faddr ie_dependfaddr.id6_addr #define ie6_laddr ie_dependladdr.id6_addr u_int32_t ie6_zoneid; /* scope zone id */ }; /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ /* protocol dependent part */ struct in_endpoints inc_ie; }; /* * Flags for inc_flags. */ #define INC_ISIPV6 0x01 #define inc_isipv6 inc_flags /* temp compatibility */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr #define inc6_zoneid inc_ie.ie6_zoneid #if defined(_KERNEL) || defined(_WANT_INPCB) /* * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A * few fields are protected by multiple locks as indicated in the locking notes * below. For these fields, all of the listed locks must be write-locked for * any modifications. However, these fields can be safely read while any one of * the listed locks are read-locked. This model can permit greater concurrency * for read operations. For example, connections can be looked up while only * holding a read lock on the global pcblist lock. 
This is important for * performance when attempting to find the connection for a packet given its IP * and port tuple. * * One noteworthy exception is that the global pcbinfo lock follows a different * set of rules in relation to the inp_list field. Rather than being * write-locked for modifications and read-locked for list iterations, it must * be read-locked during modifications and write-locked during list iterations. * This ensures that the relatively rare global list iterations safely walk a * stable snapshot of connections while allowing more common list modifications * to safely grab the pcblist lock just while adding or removing a connection * from the global list. * * Key: * (b) - Protected by the hpts lock. * (c) - Constant after initialization * (e) - Protected by the net_epoch_preempt epoch * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (l) - Protected by the pcblist lock for the inpcb * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * * Notes on the tcp_hpts: * * First Hpts lock order is * 1) INP_WLOCK() * 2) HPTS_LOCK() i.e. hpts->pmtx * * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). * You may check the inp->inp_in_hpts flag without the hpts lock. * The hpts is the only one that will clear this flag holding * only the hpts lock. This means that in your tcp_output() * routine when you test for the inp_in_hpts flag to be 1 * it may be transitioning to 0 (by the hpts). * That's ok since that will just mean an extra call to tcp_output * that most likely will find the call you executed * (when the mis-match occurred) will have put the TCB back * on the hpts and it will return. If your * call did not add the inp back to the hpts then you will either * over-send or the cwnd will block you from sending more. 
* * Note you should also be holding the INP_WLOCK() when you * call the remove from the hpts as well. Though usually * you are either doing this from a timer, where you need and have * the INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK(). * * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and * inp_input_cpu_set fields are controlled completely by * the hpts. Do not ever set these. The inp_hpts_cpu_set * and inp_input_cpu_set fields indicate if the hpts has * setup the respective cpu field. It is advised if this * field is 0, to enqueue the packet with the appropriate * hpts_immediate() call. If the _set field is 1, then * you may compare the inp_*_cpu field to the curcpu and * may want to again insert onto the hpts if these fields * are not equal (i.e. you are not on the expected CPU). * * A note on inp_hpts_calls and inp_input_calls, these * flags are set when the hpts calls either the output * or do_segment routines respectively. If the routine * being called wants to use this, then it needs to * clear the flag before returning. The hpts will not * clear the flag. The flags can be used to tell if * the hpts is the function calling the respective * routine. * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write * to a field, a write lock must generally be held. * * netinet/netinet6-layer code should not assume that the inp_socket pointer * is safe to dereference without inp_lock being held, even for protocols * other than TCP (where the inpcb persists during TIMEWAIT even after the * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). * * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock * read-lock usage during modification, this model can be applied to other * protocols (especially SCTP). 
*/ struct icmp6_filter; struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts #define inp_zero_size (sizeof(struct inpcb) - \ offsetof(struct inpcb, inp_start_zero)) TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */ uint32_t inp_hpts_request; /* Current hpts request, zero if * fits in the pacing window (i&b). */ /* * Note the next fields are protected by a * different lock (hpts-lock). This means that * they must correspond in size to the smallest * protectable bit field (uint8_t on x86, and * other platfomrs potentially uint32_t?). Also * since CPU switches can occur at different times the two * fields can *not* be collapsed into a signal bit field. */ #if defined(__amd64__) || defined(__i386__) volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */ volatile uint8_t inp_in_input; /* on input hpts (lock b) */ #else volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */ volatile uint32_t inp_in_input; /* on input hpts (lock b) */ #endif volatile uint16_t inp_hpts_cpu; /* Lock (i) */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ volatile uint16_t inp_input_cpu; /* Lock (i) */ volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */ inp_input_cpu_set : 1, /* on input hpts (i) */ inp_hpts_calls :1, /* (i) from output hpts */ inp_input_calls :1, /* (i) from input hpts */ inp_spare_bits2 : 4; uint8_t inp_spare_byte; /* Compiler hole */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ TAILQ_ENTRY(inpcb) 
inp_input; /* pacing in queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ u_char inp_ip_ttl; /* (i) time to live proto */ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ /* Protocol-dependent part; options. 
*/ struct { u_char inp_ip_tos; /* (i) type of service proto */ struct mbuf *inp_options; /* (i) IP options */ struct ip_moptions *inp_moptions; /* (i) mcast options */ }; struct { /* (i) IP options */ struct mbuf *in6p_options; /* (i) IP6 options for outgoing packets */ struct ip6_pktopts *in6p_outputopts; /* (i) IP multicast options */ struct ip6_moptions *in6p_moptions; /* (i) ICMPv6 code type filter */ struct icmp6_filter *in6p_icmp6filt; /* (i) IPV6_CHECKSUM setsockopt */ int in6p_cksum; short in6p_hops; }; CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ struct inpcbport *inp_phd; /* (i/h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ rt_gen_t inp_rt_cookie; /* generation for route entry */ union { /* cached L3 information */ struct route inp_route; struct route_in6 inp_route6; }; CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ /* (e[r]) for list iteration */ /* (p[w]/l) for addition/removal */ struct epoch_context inp_epoch_ctx; }; #endif /* _KERNEL */ #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_zoneid inp_inc.inc6_zoneid #define in6p_flowinfo inp_flow #define inp_vnet inp_pcbinfo->ipi_vnet /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. * Fields prefixed with "xi_" are unique to this structure, and the rest * match fields in the struct inpcb, to ease coding and porting. 
* * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct xsocket xi_socket; /* (s,p) */ struct in_conninfo inp_inc; /* (s,p) */ uint64_t inp_gencnt; /* (s,p) */ union { void *inp_ppcb; /* (s) netstat(1) */ int64_t ph_ppcb; }; int64_t inp_spare64[4]; uint32_t inp_flow; /* (s) */ uint32_t inp_flowid; /* (s) */ uint32_t inp_flowtype; /* (s) */ int32_t inp_flags; /* (s,p) */ int32_t inp_flags2; /* (s) */ int32_t inp_rss_listen_bucket; /* (n) */ int32_t in6p_cksum; /* (n) */ int32_t inp_spare32[4]; uint16_t in6p_hops; /* (n) */ uint8_t inp_ip_tos; /* (n) */ int8_t pad8; uint8_t inp_vflag; /* (s,p) */ uint8_t inp_ip_ttl; /* (n) */ uint8_t inp_ip_p; /* (n) */ uint8_t inp_ip_minttl; /* (n) */ int8_t inp_spare8[4]; } __aligned(8); struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count this time */ } __aligned(8); #ifdef _KERNEL void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); #endif #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { struct epoch_context phd_epoch_ctx; CK_LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; struct in_pcblist { int il_count; struct epoch_context il_epoch_ctx; struct inpcbinfo *il_pcbinfo; struct inpcb *il_inp_list[0]; }; /*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. 
* * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and * ipi_list_lock: * - ipi_lock covering the global pcb list stability during loop iteration, * - ipi_hash_lock covering the hashed lookup tables, * - ipi_list_lock covering mutable global fields (such as the global * pcb list) * * The lock order is: * * ipi_lock (before) * inpcb locks (before) * ipi_list locks (before) * {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (e) - Protected by the net_epoch_prempt epoch * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* * Global lock protecting inpcb list modification */ struct mtx ipi_lock; /* * Global list of inpcbs on the protocol. */ struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ u_int ipi_count; /* (l) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; /* (l) */ /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; /* (x) */ u_short ipi_lastlow; /* (x) */ u_short ipi_lasthi; /* (x) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ struct uma_zone *ipi_zone; /* (c) */ /* * Connection groups associated with this protocol. These fields are * constant, but pcbgroup structures themselves are protected by * per-pcbgroup locks. */ struct inpcbgroup *ipi_pcbgroups; /* (c) */ u_int ipi_npcbgroups; /* (c) */ u_int ipi_hashfields; /* (c) */ /* * Global lock protecting modification non-pcbgroup hash lookup tables. */ struct mtx ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. 
*/ struct inpcbhead *ipi_hashbase; /* (h) */ u_long ipi_hashmask; /* (h) */ /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* * List of wildcard inpcbs for use with pcbgroups. In the past, was * per-pcbgroup but is now global. All pcbgroup locks must be held * to modify the list, so any is sufficient to read it. */ struct inpcbhead *ipi_wildbase; /* (p) */ u_long ipi_wildmask; /* (p) */ /* * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. */ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */ u_long ipi_lbgrouphashmask; /* (h) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ /* * general use 2 */ void *ipi_pspare[2]; /* * Global lock protecting global inpcb list, inpcb count, etc. */ struct rwlock ipi_list_lock; }; #ifdef _KERNEL /* * Connection groups hold sets of connections that have similar CPU/thread * affinity. Each connection belongs to exactly one connection group. */ struct inpcbgroup { /* * Per-connection group hash of inpcbs, hashed by local and foreign * addresses and port numbers. */ struct inpcbhead *ipg_hashbase; /* (c) */ u_long ipg_hashmask; /* (c) */ /* * Notional affinity of this pcbgroup. */ u_int ipg_cpu; /* (p) */ /* * Per-connection group lock, not to be confused with ipi_lock. * Protects the hash table hung off the group, but also the global * wildcard list in inpcbinfo. */ struct mtx ipg_lock; } __aligned(CACHE_LINE_SIZE); /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which * is dynamically resized as processes bind/unbind to that specific group. 
*/ struct inpcblbgroup { LIST_ENTRY(inpcblbgroup) il_list; uint16_t il_lport; /* (c) */ u_char il_vflag; /* (c) */ u_char il_pad; uint32_t il_pad2; union in_dependaddr il_dependladdr; /* (c) */ #define il_laddr il_dependladdr.id46_addr.ia46_addr4 #define il6_laddr il_dependladdr.id6_addr uint32_t il_inpsiz; /* max count in il_inp[] (h) */ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ struct inpcb *il_inp[]; /* (h) */ }; LIST_HEAD(inpcblbgrouphead, inpcblbgroup); #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) #define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) #define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) #define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) #define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. 
*/ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else #define inp_lock_assert(inp) do {} while (0) #define inp_unlock_assert(inp) do {} while (0) #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg); int inp_ip_tos_get(const struct inpcb *inp); void inp_ip_tos_set(struct inpcb *inp, int val); struct socket * inp_inpcbtosocket(struct inpcb *inp); struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER_ET((et)) #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) +#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock)) -#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch()) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) -#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_UNLOCK_ASSERT(ipi) 
MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) #define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) #define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) #define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) #define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) #define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) #define INP_LIST_LOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) #define INP_LIST_RLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) #define INP_LIST_WLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) #define INP_LIST_UNLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) #define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) #define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et) +#define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_hash_et) +#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) #define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); #define 
INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) #define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) #define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) #define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) #define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) #define INP_PCBLBGROUP_PORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) #define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* * Flags for inp_vflags -- historically version flags only */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ /* * Flags for inp_flags. */ #define INP_RECVOPTS 0x00000001 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */ #define INP_HDRINCL 0x00000008 /* user supplies entire IP header */ #define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */ #define INP_LOWPORT 0x00000020 /* user wants "low" port binding */ #define INP_ANONPORT 0x00000040 /* port chosen for user */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ /* 0x000200 unused: was INP_FAITH */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ #define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */ #define INP_RECVTOS 0x00004000 /* receive incoming IP TOS */ #define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 
0x00010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x00100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x00400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ #define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */ #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ #define INP_RESERVED_0 0x10000000 /* reserved field */ #define INP_RESERVED_1 0x20000000 /* reserved field */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) /* * Flags for inp_flags2. 
*/ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ /* * Flags passed to in_pcblookup*() functions. */ #define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ #define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ #define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) /* * Constants for pcbinfo.ipi_hashfields. 
*/ #define IPI_HASHFIELDS_NONE 0 #define IPI_HASHFIELDS_2TUPLE 1 #define IPI_HASHFIELDS_4TUPLE 2 #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); VNET_DECLARE(int, ipport_lowfirstauto); VNET_DECLARE(int, ipport_lowlastauto); VNET_DECLARE(int, ipport_firstauto); VNET_DECLARE(int, ipport_lastauto); VNET_DECLARE(int, ipport_hifirstauto); VNET_DECLARE(int, ipport_hilastauto); VNET_DECLARE(int, ipport_randomized); VNET_DECLARE(int, ipport_randomcps); VNET_DECLARE(int, ipport_randomtime); VNET_DECLARE(int, ipport_stoprandom); VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_reservedhigh VNET(ipport_reservedhigh) #define V_ipport_reservedlow VNET(ipport_reservedlow) #define V_ipport_lowfirstauto VNET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET(ipport_lowlastauto) #define V_ipport_firstauto VNET(ipport_firstauto) #define V_ipport_lastauto VNET(ipport_lastauto) #define V_ipport_hifirstauto VNET(ipport_hifirstauto) #define V_ipport_hilastauto VNET(ipport_hilastauto) #define V_ipport_randomized VNET(ipport_randomized) #define V_ipport_randomcps VNET(ipport_randomcps) #define V_ipport_randomtime VNET(ipport_randomtime) #define V_ipport_stoprandom VNET(ipport_stoprandom) #define V_ipport_tcpallocs VNET(ipport_tcpallocs) void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int, int, char *, uma_init, u_int); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); struct inpcbgroup * in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); struct inpcbgroup * in_pcbgroup_byinpcb(struct inpcb *); struct inpcbgroup * in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, struct in_addr, u_short); void in_pcbgroup_destroy(struct inpcbinfo *); int in_pcbgroup_enabled(struct inpcbinfo *); void in_pcbgroup_init(struct inpcbinfo *, u_int, int); void in_pcbgroup_remove(struct inpcb *); void in_pcbgroup_update(struct inpcb *); 
void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, struct mbuf *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); int in_pcbinshash_nopcbgroup(struct inpcb *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); void in_pcblist_rele_rlocked(epoch_context_t ctx); void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * 
in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); #ifdef RATELIMIT int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t); void in_pcbdetach_txrtlmt(struct inpcb *); int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t); int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *); int in_pcbquery_txrlevel(struct inpcb *, uint32_t *); void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *); void in_pcboutput_eagain(struct inpcb *); #endif #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 335923) +++ head/sys/netinet/ip_divert.c (revision 335924) @@ -1,830 +1,833 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #ifndef INET #error "IPDIVERT requires INET" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef SCTP #include #endif #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw or other packet filters, * see the divert(4) manpage for features. * Packets are selected by the packet filter and tagged with an * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by * the packet filter) and information on the matching filter rule for * subsequent reinjection. The divert_port is used to put the packet * on the corresponding divert socket, while the rule number is passed * up (at least partially) as the sin_port in the struct sockaddr. * * Packets written to the divert socket carry in sin_addr a * destination address, and in sin_port the number of the filter rule * after which to continue processing. * If the destination address is INADDR_ANY, the packet is treated as * as outgoing and sent to ip_output(); otherwise it is treated as * incoming and sent to ip_input(). 
* Further, sin_zero carries some information on the interface, * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * packet filter processing will start at the rule number after the one * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 * will apply the entire ruleset to the packet). */ /* Internal variables. */ static VNET_DEFINE(struct inpcbhead, divcb); static VNET_DEFINE(struct inpcbinfo, divcbinfo); #define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ static eventhandler_tag ip_divert_event_tag; /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_init(void) { /* * XXX We don't use the hash list for divert IP, but it's easier to * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", div_inpcb_init, IPI_HASHFIELDS_NONE); } static void div_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_divcbinfo); } VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, div_destroy, NULL); /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ static int div_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; KMOD_IPSTAT_INC(ips_noproto); m_freem(m); return (IPPROTO_DONE); } /* * Divert a packet by passing it up to the divert socket at port 'port'. 
* * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; + struct epoch_tracker et; mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; /* record matching rule, in host format */ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; /* * Record receive interface address, if any. * But only for incoming packets. */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } if_addr_runlock(ifp); } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. 
* There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_RLOCK_ET(&V_divcbinfo, et); CK_LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { INP_RLOCK(inp); sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); INP_RUNLOCK(inp); break; } } - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); KMOD_IPSTAT_DEC(ips_delivered); } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { struct ip *const ip = mtod(m, struct ip *); struct m_tag *mtag; struct ipfw_rule_ref *dt; int error = 0; /* * An mbuf may hasn't come from userland, but we pretend * that it has. 
*/ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); if (control) m_freem(control); /* XXX */ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } m_tag_prepend(m, mtag); } dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; /* set the starting point. We provide a non-zero slot, * but a non_matching chain_id to skip that info and use * the rulenum/rule_id. */ dt->slot = 1; /* dummy, chain_id is invalid */ dt->chain_id = 0; dt->rulenum = sin->sin_port+1; /* host format ? */ dt->rule_id = 0; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct mbuf *options = NULL; struct inpcb *inp; dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; inp = sotoinpcb(so); INP_RLOCK(inp); switch (ip->ip_v) { case IPVERSION: /* * Don't allow both user specified and setsockopt * options, and don't allow packet length sizes that * will crash. 
*/ if ((((ip->ip_hl << 2) != sizeof(struct ip)) && inp->inp_options != NULL) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); /* Don't allow packet length sizes that will crash */ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; } #endif default: error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } /* Send packet to output processing */ KMOD_IPSTAT_INC(ips_rawout); /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present. */ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_NOWAIT); if (options == NULL) { INP_RUNLOCK(inp); error = ENOBUFS; goto cantsend; } } INP_RUNLOCK(inp); switch (ip->ip_v) { case IPVERSION: error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); break; #ifdef INET6 case IPV6_VERSION >> 4: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); break; #endif } if (options != NULL) m_freem(options); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; NET_EPOCH_EXIT(); goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; NET_EPOCH_EXIT(); } #ifdef MAC mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ switch (ip->ip_v) { case IPVERSION: /* * Restore M_BCAST flag when destination address is * broadcast. It is expected by ip_tryforward(). */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) m->m_flags |= M_MCAST; else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) m->m_flags |= M_BCAST; netisr_queue_src(NETISR_IP, (uintptr_t)so, m); break; #ifdef INET6 case IPV6_VERSION >> 4: netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); break; #endif default: error = EINVAL; goto cantsend; } } return (error); cantsend: m_freem(m); return (error); } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); if (error) { INP_INFO_WUNLOCK(&V_divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; 
INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } static void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb 
*inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_divcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_WLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_WUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return ENOMEM; - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_RLOCK_ET(&V_divcbinfo, et); for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_divcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_divcbinfo); if (!error) { + struct epoch_tracker et; /* * Give the user an updated idea of our state. 
* If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_RLOCK_ET(&V_divcbinfo, et); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } #ifdef SYSCTL_NODE static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); if (err != 0) return (err); ip_divert_ptr = divert_packet; ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. 
*/ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. */ INP_INFO_WLOCK(&V_divcbinfo); if (V_divcbinfo.ipi_count != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); #ifndef VIMAGE div_destroy(NULL); #endif EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); MODULE_VERSION(ipdivert, 1); Index: head/sys/netinet/ip_encap.c =================================================================== --- head/sys/netinet/ip_encap.c (revision 335923) +++ head/sys/netinet/ip_encap.c (revision 335924) @@ -1,270 +1,270 @@ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * mobile-ip6 (uses RFC2473) * RFC3056 6to4 tunnel * isatap tunnel * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure"); struct encaptab { CK_LIST_ENTRY(encaptab) chain; int proto; int min_length; int exact_match; void *arg; encap_lookup_t lookup; encap_check_t check; encap_input_t input; }; CK_LIST_HEAD(encaptab_head, encaptab); #ifdef INET static struct encaptab_head ipv4_encaptab = CK_LIST_HEAD_INITIALIZER(); #endif #ifdef INET6 static struct encaptab_head ipv6_encaptab = CK_LIST_HEAD_INITIALIZER(); #endif static struct mtx encapmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); #define ENCAP_WLOCK() mtx_lock(&encapmtx) #define ENCAP_WUNLOCK() mtx_unlock(&encapmtx) -#define ENCAP_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define ENCAP_RLOCK() struct epoch_tracker encap_et; epoch_enter_preempt(net_epoch_preempt, &encap_et) +#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &encap_et) #define ENCAP_WAIT() epoch_wait_preempt(net_epoch_preempt) static struct encaptab * encap_attach(struct encaptab_head *head, const struct encap_config *cfg, void *arg, int mflags) { struct encaptab *ep, *tmp; if (cfg == NULL || cfg->input == NULL || (cfg->check == NULL && cfg->lookup == NULL) || (cfg->lookup != NULL && cfg->exact_match != ENCAP_DRV_LOOKUP) || (cfg->exact_match == ENCAP_DRV_LOOKUP && cfg->lookup == NULL)) return (NULL); ep = malloc(sizeof(*ep), M_NETADDR, mflags); if (ep == NULL) return (NULL); ep->proto = cfg->proto; ep->min_length = cfg->min_length; ep->exact_match = cfg->exact_match; ep->arg = arg; ep->lookup = cfg->exact_match == ENCAP_DRV_LOOKUP ? cfg->lookup: NULL; ep->check = cfg->exact_match != ENCAP_DRV_LOOKUP ? 
cfg->check: NULL; ep->input = cfg->input; ENCAP_WLOCK(); CK_LIST_FOREACH(tmp, head, chain) { if (tmp->exact_match <= ep->exact_match) break; } if (tmp == NULL) CK_LIST_INSERT_HEAD(head, ep, chain); else CK_LIST_INSERT_BEFORE(tmp, ep, chain); ENCAP_WUNLOCK(); return (ep); } static int encap_detach(struct encaptab_head *head, const struct encaptab *cookie) { struct encaptab *ep; ENCAP_WLOCK(); CK_LIST_FOREACH(ep, head, chain) { if (ep == cookie) { CK_LIST_REMOVE(ep, chain); ENCAP_WUNLOCK(); ENCAP_WAIT(); free(ep, M_NETADDR); return (0); } } ENCAP_WUNLOCK(); return (EINVAL); } static int encap_input(struct encaptab_head *head, struct mbuf *m, int off, int proto) { struct encaptab *ep, *match; void *arg; int matchprio, ret; match = NULL; matchprio = 0; ENCAP_RLOCK(); CK_LIST_FOREACH(ep, head, chain) { if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->min_length > m->m_pkthdr.len) continue; if (ep->exact_match == ENCAP_DRV_LOOKUP) ret = (*ep->lookup)(m, off, proto, &arg); else ret = (*ep->check)(m, off, proto, ep->arg); if (ret <= 0) continue; if (ret > matchprio) { match = ep; if (ep->exact_match != ENCAP_DRV_LOOKUP) arg = ep->arg; /* * No need to continue the search, we got the * exact match. 
*/ if (ret >= ep->exact_match) break; matchprio = ret; } } if (match != NULL) { /* found a match, "match" has the best one */ ret = (*match->input)(m, off, proto, arg); ENCAP_RUNLOCK(); MPASS(ret == IPPROTO_DONE); return (IPPROTO_DONE); } ENCAP_RUNLOCK(); return (0); } #ifdef INET const struct encaptab * ip_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { return (encap_attach(&ipv4_encaptab, cfg, arg, mflags)); } int ip_encap_detach(const struct encaptab *cookie) { return (encap_detach(&ipv4_encaptab, cookie)); } int encap4_input(struct mbuf **mp, int *offp, int proto) { if (encap_input(&ipv4_encaptab, *mp, *offp, proto) != IPPROTO_DONE) return (rip_input(mp, offp, proto)); return (IPPROTO_DONE); } #endif /* INET */ #ifdef INET6 const struct encaptab * ip6_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { return (encap_attach(&ipv6_encaptab, cfg, arg, mflags)); } int ip6_encap_detach(const struct encaptab *cookie) { return (encap_detach(&ipv6_encaptab, cookie)); } int encap6_input(struct mbuf **mp, int *offp, int proto) { if (encap_input(&ipv6_encaptab, *mp, *offp, proto) != IPPROTO_DONE) return (rip6_input(mp, offp, proto)); return (IPPROTO_DONE); } #endif /* INET6 */ Index: head/sys/netinet/ip_gre.c =================================================================== --- head/sys/netinet/ip_gre.c (revision 335923) +++ head/sys/netinet/ip_gre.c (revision 335924) @@ -1,299 +1,300 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; #define V_ip_gre_ttl VNET(ip_gre_ttl) SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); static VNET_DEFINE(struct gre_list *, ipv4_hashtbl) = NULL; #define V_ipv4_hashtbl VNET(ipv4_hashtbl) #define GRE_HASH(src, dst) (V_ipv4_hashtbl[\ in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH((sc)->gre_oip.ip_src.s_addr,\ (sc)->gre_oip.ip_dst.s_addr) static uint32_t in_gre_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static int in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst) { struct gre_softc *tmp; if (sc->gre_family == AF_INET && sc->gre_oip.ip_src.s_addr == src && sc->gre_oip.ip_dst.s_addr == dst) return (EEXIST); CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->gre_oip.ip_src.s_addr == src && tmp->gre_oip.ip_dst.s_addr == dst) return (EADDRNOTAVAIL); } return (0); } static int in_gre_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct gre_softc *sc; if (V_ipv4_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &GRE_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { /* * This is an inbound packet, its ip_dst is source address * in softc. 
*/ if (sc->gre_oip.ip_src.s_addr == ip->ip_dst.s_addr && sc->gre_oip.ip_dst.s_addr == ip->ip_src.s_addr) { if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } static void in_gre_attach(struct gre_softc *sc) { sc->gre_hlen = sizeof(struct greip); sc->gre_oip.ip_v = IPVERSION; sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; sc->gre_oip.ip_p = IPPROTO_GRE; gre_updatehdr(sc, &sc->gre_gihdr->gi_gre); CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); } void in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { MPASS(cmd == GRESKEY || cmd == GRESOPTS); /* NOTE: we are protected with gre_ioctl_sx lock */ MPASS(sc->gre_family == AF_INET); CK_LIST_REMOVE(sc, chain); GRE_WAIT(); if (cmd == GRESKEY) sc->gre_key = value; else sc->gre_options = value; in_gre_attach(sc); } int in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *dst, *src; struct ip *ip; int error; /* NOTE: we are protected with gre_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(*src)) break; if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } if (V_ipv4_hashtbl == NULL) V_ipv4_hashtbl = gre_hashinit(); error = in_gre_checkdup(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. 
*/ error = 0; break; } ip = malloc(sizeof(struct greip) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip->ip_src.s_addr = src->sin_addr.s_addr; ip->ip_dst.s_addr = dst->sin_addr.s_addr; if (sc->gre_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, chain); GRE_WAIT(); free(sc->gre_hdr, M_GRE); /* XXX: should we notify about link state change? */ } sc->gre_family = AF_INET; sc->gre_hdr = ip; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; in_gre_attach(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (sc->gre_family != AF_INET) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); src->sin_addr = (cmd == SIOCGIFPSRCADDR) ? sc->gre_oip.ip_src: sc->gre_oip.ip_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } int in_gre_output(struct mbuf *m, int af, int hlen) { struct greip *gi; gi = mtod(m, struct greip *); switch (af) { case AF_INET: /* * gre_transmit() has used M_PREPEND() that doesn't guarantee * m_data is contiguous more than hlen bytes. Use m_copydata() * here to avoid m_pullup(). 
*/ m_copydata(m, hlen + offsetof(struct ip, ip_tos), sizeof(u_char), &gi->gi_ip.ip_tos); m_copydata(m, hlen + offsetof(struct ip, ip_id), sizeof(u_short), (caddr_t)&gi->gi_ip.ip_id); break; #ifdef INET6 case AF_INET6: gi->gi_ip.ip_tos = 0; /* XXX */ ip_fillid(&gi->gi_ip); break; #endif } gi->gi_ip.ip_ttl = V_ip_gre_ttl; gi->gi_ip.ip_len = htons(m->m_pkthdr.len); return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL)); } static const struct encaptab *ecookie = NULL; static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_GRE, .min_length = sizeof(struct greip) + sizeof(struct ip), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in_gre_lookup, .input = gre_input }; void in_gre_init(void) { if (!IS_DEFAULT_VNET(curvnet)) return; ecookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); } void in_gre_uninit(void) { if (IS_DEFAULT_VNET(curvnet)) ip_encap_detach(ecookie); if (V_ipv4_hashtbl != NULL) gre_hashdestroy(V_ipv4_hashtbl); } Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 335923) +++ head/sys/netinet/raw_ip.c (revision 335924) @@ -1,1147 +1,1150 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. 
*/ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ extern struct protosw inetsw[]; u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, 
pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); CK_LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE static void rip_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ripcbinfo); } VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv4)) { if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0) policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. 
*/ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; + struct epoch_tracker et; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); last = NULL; } INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) goto skip_1; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. 
*/ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) goto skip_1; } last = inp; continue; skip_1: INP_RUNLOCK(inp); } CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); last = NULL; } INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) goto skip_2; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) goto skip_2; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. 
*/ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); goto skip_2; } } last = inp; continue; skip_2: INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); } else { m_freem(m); } } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. 
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) != m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. 
*/ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... 
*/ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); 
error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. 
*/ IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo 
*pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); if (CK_STAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = 
sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { + struct epoch_tracker et; /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. 
*/ - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: head/sys/netinet/tcp_hpts.c =================================================================== --- head/sys/netinet/tcp_hpts.c (revision 335923) +++ head/sys/netinet/tcp_hpts.c (revision 335924) @@ -1,1902 +1,1905 @@ /*- * Copyright (c) 2016-2018 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" /** * Some notes about usage. * * The tcp_hpts system is designed to provide a high precision timer * system for tcp. Its main purpose is to provide a mechanism for * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * * First, and probably the main thing it's used by Rack and BBR for, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The * slot is the time from now that the stack wants to be called but it * must be converted to tcp_hpts's notion of slot. This is done with * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical * call from the tcp_output() routine might look like: * * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); * * The above would schedule tcp_output() to be called in 550 microseconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ACKs). So it would add something like: * * if (inp->inp_in_hpts) * return; * * to prevent output processing until the time allotted has gone by.
* Of course this is a bare bones example and the stack will probably * have more considerations than just the above. * * Now the tcp_hpts system will call tcp_output in one of two forms, * it will first check to see if the stack has defined a * tfb_tcp_output_wtime() function, if so that is the routine it * will call, if that function is not defined then it will call the * tfb_tcp_output() function. The only difference between these * two calls is that the former passes the time in to the function * so the function does not have to access the time (which tcp_hpts * already has). What these functions do is of course totally up * to the individual tcp stack. * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort * a connection (later) or process input on a connection. * Why would you want to do this? To keep processor locality. * * So in order to use the input redirection function the * stack changes its tcp_do_segment() routine so that, instead * of processing the data, it calls the function: * * tcp_queue_pkt_to_input() * * You will note that the arguments to this function look * a lot like tcp_do_segment()'s arguments. This function * will assure that the tcp_hpts system will * call the function tfb_tcp_hpts_do_segment() from the * correct CPU. Note that multiple calls can get pushed * into the tcp_hpts system; this will be indicated by * the next to last argument to tfb_tcp_hpts_do_segment() * (nxt_pkt). If nxt_pkt is a 1 then another packet is * coming. If nxt_pkt is a 0 then this is the last call * that the tcp_hpts system has available for the tcp stack. * * The other point of the input system is to be able to safely * drop a tcp connection without worrying about the recursive * locking that may be occurring on the INP_WLOCK.
So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) * * To schedule the tcp_hpts system to call * * tcp_drop(tp, drop_reason) * * at a future point. This is quite handy to prevent locking * issues when dropping connections. * */ #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef tcpdebug #include #endif /* tcpdebug */ #ifdef tcp_offload #include #endif #ifdef ipsec #include #include #endif /* ipsec */ #include "opt_rss.h" MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 0; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); static struct tcp_hptsi tcp_pace; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int32_t tcp_hpts_callout_skip_swi = 0; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls"); #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 
1000000; \ } \ } while (0) static int32_t logging_on = 0; static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &logging_on, 0, "Turn on logging if compiled in"); counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); static int32_t in_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, &in_newts_every_tcb, 0, "Do we have a new cts every tcb we process for input"); static int32_t in_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, &in_ts_percision, 0, "Do we use percise timestamp for clients on input"); static int32_t out_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, &out_newts_every_tcb, 0, "Do we have a new cts every tcb we process for output"); static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, &hpts_sleep_max, 0, "The maximum time the hpts will sleep <1 - 254>"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, "The minimum time the hpts must sleep before processing more slots"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, &tcp_hpts_callout_skip_swi, 0, "Do we have the callout call directly to the hpts?"); static void __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, 
uint32_t ticknow, int32_t line) { struct hpts_log *pl; HPTS_MTX_ASSERT(hpts); if (hpts->p_log == NULL) return; pl = &hpts->p_log[hpts->p_log_at]; hpts->p_log_at++; if (hpts->p_log_at >= hpts->p_logsize) { hpts->p_log_at = 0; hpts->p_log_wrapped = 1; } pl->inp = inp; if (inp) { pl->t_paceslot = inp->inp_hptsslot; pl->t_hptsreq = inp->inp_hpts_request; pl->p_onhpts = inp->inp_in_hpts; pl->p_oninput = inp->inp_in_input; } else { pl->t_paceslot = 0; pl->t_hptsreq = 0; pl->p_onhpts = 0; pl->p_oninput = 0; } pl->is_notempty = 1; pl->event = event; pl->line = line; pl->cts = tcp_get_usecs(NULL); pl->p_curtick = hpts->p_curtick; pl->p_prevtick = hpts->p_prevtick; pl->p_on_queue_cnt = hpts->p_on_queue_cnt; pl->ticknow = ticknow; pl->slot_req = slot; pl->p_nxt_slot = hpts->p_nxt_slot; pl->p_cur_slot = hpts->p_cur_slot; pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; pl->p_flags = (hpts->p_cpu & 0x7f); pl->p_flags <<= 7; pl->p_flags |= (hpts->p_num & 0x7f); pl->p_flags <<= 2; if (hpts->p_hpts_active) { pl->p_flags |= HPTS_HPTS_ACTIVE; } } #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) static void hpts_timeout_swi(void *arg) { struct tcp_hpts_entry *hpts; hpts = (struct tcp_hpts_entry *)arg; swi_sched(hpts->ie_cookie, 0); } static void hpts_timeout_dir(void *arg) { tcp_hpts_thread(arg); } static inline void hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_hpts == 0) { /* We are not on the hpts? 
*/ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } if (TAILQ_EMPTY(head) && (hpts->p_on_queue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p hpts bucket empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_queue_cnt); } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; if (hpts->p_on_queue_cnt < 0) { /* Count should not go negative .. */ #ifdef INVARIANTS panic("Hpts goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_queue_cnt = 0; } if (clear) { inp->inp_hpts_request = 0; inp->inp_in_hpts = 0; } } static inline void hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if ((noref == 0) && (inp->inp_in_hpts == 1)) { /* We are already on the hpts? */ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(head, inp, inp_hpts); inp->inp_in_hpts = 1; hpts->p_on_queue_cnt++; if (noref == 0) { in_pcbref(inp); } } static inline void hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 0) { /* We are not on the input hpts? 
*/ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_REMOVE(&hpts->p_input, inp, inp_input); hpts->p_on_inqueue_cnt--; if (hpts->p_on_inqueue_cnt < 0) { #ifdef INVARIANTS panic("Hpts in goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_inqueue_cnt = 0; } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p in_hpts input empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); } #endif if (clear) inp->inp_in_input = 0; } static inline void hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 1) { /* We are already on the input hpts? */ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); inp->inp_in_input = 1; hpts->p_on_inqueue_cnt++; in_pcbref(inp); } static int sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) { struct tcp_hpts_entry *hpts; size_t sz; int32_t logging_was, i; int32_t error = 0; /* * HACK: Turn off logging so no locks are required this really needs * a memory barrier :) */ logging_was = logging_on; logging_on = 0; if (!req->oldptr) { /* How much? 
*/ sz = 0; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; sz += (sizeof(struct hpts_log) * hpts->p_logsize); } error = SYSCTL_OUT(req, 0, sz); } else { for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; if (hpts->p_log_wrapped) sz = (sizeof(struct hpts_log) * hpts->p_logsize); else sz = (sizeof(struct hpts_log) * hpts->p_log_at); error = SYSCTL_OUT(req, hpts->p_log, sz); } } logging_on = logging_was; return error; } SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on a ENOBUF */ wakeup_one(hpts); } } static void tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on a ENOBUF */ wakeup_one(hpts); } } struct tcp_hpts_entry * tcp_cur_hpts(struct inpcb *inp) { int32_t hpts_num; struct tcp_hpts_entry *hpts; hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; return (hpts); } struct tcp_hpts_entry * tcp_hpts_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_hpts_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } struct tcp_hpts_entry * tcp_input_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_input_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif 
mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_input_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } static void tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { int32_t add_freed; if (inp->inp_flags2 & INP_FREED) { /* * Need to play a special trick so that in_pcbrele_wlocked * does not return 1 when it really should have returned 0. */ add_freed = 1; inp->inp_flags2 &= ~INP_FREED; } else { add_freed = 0; } #ifndef INP_REF_DEBUG if (in_pcbrele_wlocked(inp)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0. */ panic("inpcb:%p release ret 1", inp); } #else if (__in_pcbrele_wlocked(inp, line)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0. */ panic("inpcb:%p release ret 1", inp); } #endif if (add_freed) { inp->inp_flags2 |= INP_FREED; } } static void tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { if (inp->inp_in_hpts) { hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1); tcp_remove_hpts_ref(inp, hpts, line); } } static void tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input) { hpts_sane_input_remove(hpts, inp, 1); tcp_remove_hpts_ref(inp, hpts, line); } } /* * Called normally with the INP_LOCKED but it * does not matter, the hpts lock is the key * but the lock order allows us to hold the * INP lock and then get the hpts lock. * * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. * Note that you can or both values together and get two * actions. 
*/ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); if (flags & HPTS_REMOVE_OUTPUT) { hpts = tcp_hpts_lock(inp); tcp_hpts_remove_locked_output(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } if (flags & HPTS_REMOVE_INPUT) { hpts = tcp_input_lock(inp); tcp_hpts_remove_locked_input(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } } static inline int hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) { return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); } static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { int32_t need_wake = 0; uint32_t ticknow = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ if (hpts->p_hpts_active == 0) { /* A sleeping hpts we want in next slot to run */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, hpts_tick(hpts, 1)); } inp->inp_hptsslot = hpts_tick(hpts, 1); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); } need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* * We can't allow you to go into the same slot we * are in. We must put you out. */ inp->inp_hptsslot = hpts->p_nxt_slot; } else inp->inp_hptsslot = hpts->p_cur_slot; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. 
*/ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } } return (need_wake); } int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line) { int32_t ret; struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); mtx_unlock(&hpts->p_mtx); return (ret); } static void tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, struct hpts_diag *diag, int32_t noref) { int32_t need_new_to = 0; int32_t need_wakeup = 0; uint32_t largest_slot; uint32_t ticknow = 0; uint32_t slot_calc; HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->slot_req = slot; } if ((inp->inp_in_hpts == 0) || noref) { inp->inp_hpts_request = slot; if (slot == 0) { /* Immediate */ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); return; } if (hpts->p_hpts_active) { /* * Its slot - 1 since nxt_slot is the next tick that * will go off since the hpts is awake */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); } /* * We want to make sure that we don't place a inp in * the range of p_cur_slot <-> p_nxt_slot. If we * take from p_nxt_slot to the end, plus p_cur_slot * and then take away 2, we will know how many is * the max slots we can use. */ if (hpts->p_nxt_slot > hpts->p_cur_slot) { /* * Non-wrap case nxt_slot <-> cur_slot we * don't want to land in. So the diff gives * us what is taken away from the number of * slots. */ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { largest_slot = NUM_OF_HPTSI_SLOTS - 2; } else { /* * Wrap case so the diff gives us the number * of slots that we can land in. 
*/ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; } /* * We take away two so we never have a problem (20 * usec's) out of 1024000 usecs */ largest_slot -= 2; if (inp->inp_hpts_request > largest_slot) { /* * Restrict max jump of slots and remember * leftover */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* This one will run when we hit it */ inp->inp_hpts_request = 0; } if (hpts->p_nxt_slot == hpts->p_cur_slot) slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; else slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; if (slot_calc == hpts->p_cur_slot) { #ifdef INVARIANTS /* TSNH */ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", hpts, slot_calc, slot, largest_slot); #endif if (slot_calc) slot_calc--; else slot_calc = NUM_OF_HPTSI_SLOTS - 1; } inp->inp_hptsslot = slot_calc; if (diag) { diag->inp_hptsslot = inp->inp_hptsslot; } } else { /* * The hpts is sleeping, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; uint32_t slot_now; struct timeval tv; ticknow = tcp_gethptstick(&tv); slot_now = ticknow % NUM_OF_HPTSI_SLOTS; /* * The user wants to be inserted at (slot_now + * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. */ largest_slot = NUM_OF_HPTSI_SLOTS - 2; if (inp->inp_hpts_request > largest_slot) { /* Adjust the residual in inp_hpts_request */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* No residual it all fits */ inp->inp_hpts_request = 0; } inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; if (diag) { diag->slot_now = slot_now; diag->inp_hptsslot = inp->inp_hptsslot; diag->p_on_min_sleep = hpts->p_on_min_sleep; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); } /* Now do we need to restart the hpts's timer? 
*/ if (TSTMP_GT(ticknow, hpts->p_curtick)) have_slept = ticknow - hpts->p_curtick; else have_slept = 0; if (have_slept < hpts->p_hpts_sleep_time) { /* This should be what happens */ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; } else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; } if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { /* * We need to reschedule the hptss time-out. */ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; } } else if (need_new_to) { int32_t co_ret; struct timeval tv; sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; while (need_new_to > HPTS_USEC_IN_SEC) { tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } tv.tv_usec = need_new_to; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } } else { #ifdef INVARIANTS panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); #endif } } uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ struct tcp_hpts_entry 
*hpts; uint32_t slot_on, cts; struct timeval tv; /* * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if (in_ts_percision) microuptime(&tv); else getmicrouptime(&tv); cts = tcp_tv_to_usectick(&tv); tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); } /* * Convenience form of tcp_hpts_insert_diag() that passes a NULL diag. */ uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } /* * Put inp on the input queue of hpts if it is not already there and wake * the hpts when it is not active. The hpts mutex must be held (asserted). * Returns 0 when inp was already queued and the hpts is active, 1 when inp * was newly queued and the hpts is active, 2 when inp was newly queued and * the hpts was woken, 4 when inp was already queued and the hpts was woken. */ int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { int32_t retval = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); retval = 1; if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping. */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { retval = 4; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } return (retval); } /* * Append mbuf m to the pending-input packet list of tp (t_in_pkt / * t_tail_pkt). The TCP header offset, tlen, drop_hdrlen and iptos are * stashed in the packet header together with a lock marker recording * whether the caller is inside an epoch section. The INP write lock must * be held (asserted). */ void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked) + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) { /* Setup packet for input first */ INP_WLOCK_ASSERT(tp->t_inpcb); m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); m->m_pkthdr.pace_tlen = (uint16_t) tlen; m->m_pkthdr.pace_drphdrlen = drop_hdrlen; m->m_pkthdr.pace_tos = iptos; /* * tcp_input_data() compares pace_lock against TI_RLOCKED, so store * one of the TI_* constants (as the old ti_locked argument did) * rather than a raw 0/1 truth value. */ - m->m_pkthdr.pace_lock = (uint8_t) ti_locked; + m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0) ? TI_RLOCKED : TI_UNLOCKED; if (tp->t_in_pkt == NULL) { tp->t_in_pkt = m; tp->t_tail_pkt = m; } else { tp->t_tail_pkt->m_nextpkt = m; tp->t_tail_pkt = m; } } int32_t __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t
tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line){ + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ struct tcp_hpts_entry *hpts; int32_t ret; - tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked); + tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); hpts = tcp_input_lock(tp->t_inpcb); ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } /* * Ask the input side of the hpts to drop inp with the given reason. Under * the hpts mutex obtained from tcp_input_lock() this makes sure inp is on * the input queue, wakes the hpts if it is not active, and records reason * in inp->inp_hpts_drop_reas. */ void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line) { struct tcp_hpts_entry *hpts; struct tcpcb *tp; tp = intotcpcb(inp); hpts = tcp_input_lock(tp->t_inpcb); if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping. */ hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } inp->inp_hpts_drop_reas = reason; mtx_unlock(&hpts->p_mtx); } /* * Pick a CPU for inp: reuse the input or hpts CPU when one has already * been set, otherwise choose one from arc4random() reduced modulo * mp_ncpus. */ static uint16_t hpts_random_cpu(struct inpcb *inp){ /* * No flow type is set, so distribute the load randomly. */ uint16_t cpuid; uint32_t ran; /* * If one has been set use it i.e. we want both in and out on the * same hpts. */ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* Nothing set use a random number */ ran = arc4random(); cpuid = (ran & 0xffff) % mp_ncpus; return (cpuid); } static uint16_t hpts_cpuid(struct inpcb *inp){ uint16_t cpuid; /* * If one has been set use it i.e. we want both in and out on the * same hpts.
*/ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* Neither is set yet, so choose one; once one is set the other must be the same */ #ifdef RSS /* Ask RSS for the CPU of this flow; fall back to hpts_random_cpu() if it has none */ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (hpts_random_cpu(inp)); else return (cpuid); #else /* * Without RSS we don't have a flowid -> cpuid mapping, so cheat: * use the flowid modulo mp_ncpus when a flow type is set and * otherwise fall back to hpts_random_cpu(). Not the best, but * apparently better than defaulting to swi 0. */ if (inp->inp_flowtype != M_HASHTYPE_NONE) { cpuid = inp->inp_flowid % mp_ncpus; return (cpuid); } cpuid = hpts_random_cpu(inp); return (cpuid); #endif } /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary * list (e.g. TAILQ_CONCAT). If you do that the subtle * interactions of switching CPU's will kill because of * problems in the linked list manipulation. Basically * you would switch cpu's with the hpts mutex locked * but then while you were processing one of the inp's * some other one that you switch will get a new * packet on the different CPU. It will insert it * on the new hpts's input list. Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. * * You will die in an ASSERT for tailq corruption if you * run INVARIANTS or you will die horribly without * INVARIANTS in some unknown way with a corrupt linked * list.
*/ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; int32_t ti_locked = TI_UNLOCKED; + struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); if (inp->inp_input_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } hpts->p_inp = inp; drop_reason = inp->inp_hpts_drop_reas; inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); CURVNET_SET(inp->inp_vnet); if (drop_reason) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else { ti_locked = TI_UNLOCKED; } INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } ti_locked = TI_UNLOCKED; CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out; } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ m = tp->t_in_pkt; if (m) n = m->m_nextpkt; else n = NULL; tp->t_in_pkt = NULL; while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } tp = tcp_drop(tp, drop_reason); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } if (set_cpu) { /* * Setup so the next time we will move to the right * CPU. This should be a rare event. It will * sometimes happens when we are the client side * (usually not the server). Somehow tcp_output() * gets called before the tcp_do_segment() sets the * intial state. This means the r_cpu and r_hpts_cpu * is 0. 
We get on the hpts, and then tcp_input() * gets called setting up the r_cpu to the correct * value. The hpts goes off and sees the mis-match. * We simply correct it here and the CPU will switch * to the new hpts nextime the tcb gets added to the * the hpts (not this time) :-) */ tcp_set_hpts(inp); } m = tp->t_in_pkt; n = NULL; if (m != NULL && (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); m = tp->t_in_pkt; } if (in_newts_every_tcb) { if (in_ts_percision) microuptime(tv); else getmicrouptime(tv); } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } /* Any input work to do, if so do it first */ if ((m != NULL) && (m == tp->t_in_pkt)) { struct tcphdr *th; int32_t tlen, drop_hdrlen, nxt_pkt; uint8_t iptos; n = m->m_nextpkt; tp->t_in_pkt = tp->t_tail_pkt = NULL; while (m) { th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); tlen = m->m_pkthdr.pace_tlen; drop_hdrlen = m->m_pkthdr.pace_drphdrlen; iptos = m->m_pkthdr.pace_tos; m->m_nextpkt = NULL; if (n) nxt_pkt = 1; else nxt_pkt = 0; inp->inp_input_calls = 1; if (tp->t_fb->tfb_tcp_hpts_do_segment) { /* Use the hpts specific do_segment */ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, - tlen, iptos, ti_locked, nxt_pkt, tv); + tlen, iptos, nxt_pkt, tv); } else { /* Use the default do_segment */ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, - tlen, iptos, ti_locked); + tlen, iptos); } + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); /* * Do segment returns unlocked we need the * lock again but we also need some kasserts * here. 
*/ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); m = n; if (m) n = m->m_nextpkt; if (m != NULL && m->m_pkthdr.pace_lock == TI_RLOCKED) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; INP_WLOCK(inp); /* * Since we have an opening here we must * re-check if the tcb went away while we * were getting the lock(s). */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { /* Connection went away: free every packet still queued */ while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } goto out; } /* * Now that we hold the INP lock, check if * we need to upgrade our lock. */ if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; CURVNET_RESTORE(); } } /* * Estimate how many ticks the hpts has to run to catch up from p_prevtick * to p_curtick, and set p_nxt_slot to the slot it will be on afterwards. * Returns -1 when p_prevtick equals p_curtick (woken right away). Otherwise * returns the tick difference capped at NUM_OF_HPTSI_SLOTS - 2, or 1 when * p_prevtick is zero or not behind p_curtick. Panics if the resulting * p_nxt_slot equals p_cur_slot. */ static int tcp_hpts_est_run(struct tcp_hpts_entry *hpts) { int32_t ticks_to_run; if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { ticks_to_run = hpts->p_curtick - hpts->p_prevtick; if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; } } else { if (hpts->p_prevtick == hpts->p_curtick) { /* This happens when we get woken up right away */ return (-1); } ticks_to_run = 1; } /* Set in where we will be when we catch up */ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; if (hpts->p_nxt_slot == hpts->p_cur_slot) { panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); } return (ticks_to_run); } /* * Run one pass of the hpts: process pending input, walk the wheel slots * that have come due calling each connection's output routine, then choose * the next sleep time. Entered with the hpts mutex held (asserted). */ static void tcp_hptsi(struct tcp_hpts_entry *hpts,
struct timeval *ctick) { struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; int32_t ticks_to_run, i, error, tick_now, interum_tick; int32_t paced_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; uint32_t cts; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); hpts->p_curtick = tcp_tv_to_hptstick(ctick); cts = tcp_tv_to_usectick(ctick); memcpy(&tv, ctick, sizeof(struct timeval)); hpts->p_cur_slot = hpts_tick(hpts, 1); /* Figure out if we had missed ticks */ again: HPTS_MTX_ASSERT(hpts); ticks_to_run = tcp_hpts_est_run(hpts); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); /* Reset the ticks to run and time if we need too */ interum_tick = tcp_gethptstick(&tv); if (interum_tick != hpts->p_curtick) { /* Save off the new time we execute to */ *ctick = tv; hpts->p_curtick = interum_tick; cts = tcp_tv_to_usectick(&tv); hpts->p_cur_slot = hpts_tick(hpts, 1); ticks_to_run = tcp_hpts_est_run(hpts); } if (ticks_to_run == -1) { goto no_run; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there * was not any */ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* For debugging */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); } hpts->p_inp = inp; paced_cnt++; if (hpts->p_cur_slot != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); } /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 
0; } hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; } if (inp->inp_hpts_request) { /* * This guy is deferred out further in time * then our wheel had on it. Push him back * on the wheel. */ int32_t remaining_slots; remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* * Keep INVARIANTS happy by clearing * the flag */ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; } /* * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see its on the hpts until just * before we unlock. */ inp->inp_in_hpts = 0; mtx_unlock(&hpts->p_mtx); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out_now; } if (set_cpu) { /* * Setup so the next time we will move to * the right CPU. This should be a rare * event. It will sometimes happens when we * are the client side (usually not the * server). Somehow tcp_output() gets called * before the tcp_do_segment() sets the * intial state. This means the r_cpu and * r_hpts_cpu is 0. We get on the hpts, and * then tcp_input() gets called setting up * the r_cpu to the correct value. The hpts * goes off and sees the mis-match. 
We * simply correct it here and the CPU will * switch to the new hpts nextime the tcb * gets added to the the hpts (not this one) * :-) */ tcp_set_hpts(inp); } if (out_newts_every_tcb) { struct timeval sv; if (out_ts_percision) microuptime(&sv); else getmicrouptime(&sv); cts = tcp_tv_to_usectick(&sv); } CURVNET_SET(inp->inp_vnet); /* * There is a hole here, we get the refcnt on the * inp so it will still be preserved but to make * sure we can get the INP we need to hold the p_mtx * above while we pull out the tp/inp, as long as * fini gets the lock first we are assured of having * a sane INP we can lock and test. */ #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } inp->inp_hpts_calls = 1; if (tp->t_fb->tfb_tcp_output_wtime != NULL) { error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); } else { error = tp->t_fb->tfb_tcp_output(tp); } if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can * prefetch its ppcb. Note this may seem * "risky" since we have no locks (other * than the previous inp) and there no * assurance that ninp was not pulled while * we were processing inp and freed. If this * occured it could mean that either: * * a) Its NULL (which is fine we won't go * here) b) Its valid (which is cool we * will prefetch it) c) The inp got * freed back to the slab which was * reallocated. Then the piece of memory was * re-used and something else (not an * address) is in inp_ppcb. If that occurs * we don't crash, but take a TLB shootdown * performance hit (same as if it was NULL * and we tried to pre-fetch it). * * Considering that the likelyhood of is * quite rare we will take a risk on doing * this. If performance drops after testing * we can always take this out. NB: the * kern_prefetch on amd64 actually has * protection against a bad address now via * the DMAP_() tests. 
This will prevent the * TLB hit, and instead if occurs just * cause us to load cache with a useless * address (to us). */ kern_prefetch(ninp->inp_ppcb, &prefetch_tp); prefetch_tp = 1; } INP_WUNLOCK(inp); INP_UNLOCK_ASSERT(inp); CURVNET_RESTORE(); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; hpts->p_cur_slot++; if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { hpts->p_cur_slot = 0; } } no_one: HPTS_MTX_ASSERT(hpts); hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ /* Re-run any input that may be there */ (void)tcp_gethptstick(&tv); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif tick_now = tcp_gethptstick(&tv); if (SEQ_GT(tick_now, hpts->p_prevtick)) { struct timeval res; /* Did we really spend a full tick or more in here? */ timersub(&tv, ctick, &res); if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { counter_u64_add(hpts_loops, 1); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); } *ctick = res; hpts->p_curtick = tick_now; goto again; } } no_run: { uint32_t t = 0, i, fnd = 0; if (hpts->p_on_queue_cnt) { /* * Find next slot that is occupied and use that to * be the sleep time. 
*/ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; } t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { hpts->p_hpts_sleep_time = i; } else { /* The count says entries exist but no slot holds one: reset the count */ counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); #endif hpts->p_on_queue_cnt = 0; goto non_found; } t++; } else { /* No one on the wheel sleep for all but 2 slots */ non_found: if (hpts_sleep_max == 0) hpts_sleep_max = 1; hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); t = 0; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); } } } /* * Assign inp its hpts (output) CPU and its input CPU if they have not been * set yet. Each is only assigned while inp is not on the corresponding * queue, and each assignment is made under the matching hpts mutex. The INP * write lock must be held (asserted). */ void __tcp_set_hpts(struct inpcb *inp, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if ((inp->inp_in_hpts == 0) && (inp->inp_hpts_cpu_set == 0)) { inp->inp_hpts_cpu = hpts_cpuid(inp); inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if ((inp->inp_input_cpu_set == 0) && (inp->inp_in_input == 0)) { inp->inp_input_cpu = hpts_cpuid(inp); inp->inp_input_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } /* * Return p_delayed_by of the hpts entry indexed by inp->inp_hpts_cpu; the * value is read without taking the hpts mutex. NOTE(review): p_delayed_by * is declared uint32_t in struct tcp_hpts_entry, so the uint16_t return * narrows it -- confirm this truncation is intended. */ uint16_t tcp_hpts_delayedby(struct inpcb *inp){ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); } /* * Handler that runs one hpts; ctx is the struct tcp_hpts_entry. On a direct * wakeup the callout is stopped. On a timeout wakeup it returns early when * the callout is pending or no longer active. Otherwise it marks the hpts * active, runs tcp_hptsi() and then re-arms the callout for the computed * sleep time. */ static void tcp_hpts_thread(void *ctx) { struct tcp_hpts_entry *hpts; struct timeval tv; sbintime_t sb; hpts = (struct tcp_hpts_entry *)ctx; mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } hpts->p_hpts_active = 1; (void)tcp_gethptstick(&tv); tcp_hptsi(hpts, &tv); HPTS_MTX_ASSERT(hpts);
tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ hpts->p_on_min_sleep = 0; } hpts->p_hpts_active = 0; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } hpts->p_direct_wake = 0; mtx_unlock(&hpts->p_mtx); } #undef timersub static void tcp_init_hptsi(void *st) { int32_t i, j, error, bound = 0, created = 0; size_t sz, asz; struct timeval tv; sbintime_t sb; struct tcp_hpts_entry *hpts; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); hpts = tcp_pace.rp_ent[i]; /* * Init all the hpts structures that are not specifically * zero'd by the allocations. Also lets attach them to the * appropriate sysctl block as well. 
*/ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", MTX_DEF | MTX_DUPOK); TAILQ_INIT(&hpts->p_input); for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { TAILQ_INIT(&hpts->p_hptss[j]); } sysctl_ctx_init(&hpts->hpts_ctx); sprintf(unit, "%d", i); hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), OID_AUTO, unit, CTLFLAG_RW, 0, ""); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "in_qcnt", CTLFLAG_RD, &hpts->p_on_inqueue_cnt, 0, "Count TCB's awaiting input processing"); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, "Is the hpts active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, "What the current slot is if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, "What the current tick on if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "logsize", CTLFLAG_RD, &hpts->p_logsize, 0, "Hpts logging buffer size"); hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; hpts->p_num = i; hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_prevtick -= 1; hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; hpts->p_cpu = 0xffff; hpts->p_nxt_slot = 1; hpts->p_logsize = tcp_hpts_logging_size; if (hpts->p_logsize) { sz = (sizeof(struct hpts_log) * hpts->p_logsize); hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); } callout_init(&hpts->co, 1); } /* * Now lets start ithreads to handle the hptss. 
*/ CPU_FOREACH(i) { hpts = tcp_pace.rp_ent[i]; hpts->p_cpu = i; error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); if (error) { panic("Can't add hpts:%p i:%d err:%d", hpts, i, error); } created++; if (tcp_bind_threads) { if (intr_event_bind(hpts->ie, i) == 0) bound++; } tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } } printf("TCP Hpts created %d swi interrupt thread and bound %d\n", created, bound); return; } SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL); MODULE_VERSION(tcphpts, 1); Index: head/sys/netinet/tcp_hpts.h =================================================================== --- head/sys/netinet/tcp_hpts.h (revision 335923) +++ head/sys/netinet/tcp_hpts.h (revision 335924) @@ -1,304 +1,304 @@ /*- * Copyright (c) 2016-2018 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_hpts_h__ #define __tcp_hpts_h__ /* * The hpts uses a wheel of 102400 slots. The wheel * defines the time in 10 usec increments (102400 x 10). * This gives a range of 10usec - 1024ms to place * an entry within. If the user requests more than * 1.024 seconds, a remainder is attached and the hpts, * when seeing the remainder, will re-insert the * inpcb forward in time from where it is until * the remainder is zero. */ #define NUM_OF_HPTSI_SLOTS 102400 TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 /* * Convert milliseconds / microseconds to wheel slots (microseconds round * up). The argument is parenthesized so that an expression such as a + b * converts as a whole. */ #define HPTS_MS_TO_SLOTS(x) ((x) * 100) #define HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 #define DEFAULT_HPTS_LOG 3072 /* * Log flags consist of * 7f 7f 1 1 bits * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE * * So for example cpu 10, number 10 with * input active would show up as: * p_flags = 0001010 0001010 1 0 * * p_flags = 0x142a */ #define HPTS_HPTS_ACTIVE 0x01 #define HPTS_INPUT_ACTIVE 0x02 #define HPTSLOG_IMMEDIATE 1 #define HPTSLOG_INSERT_NORMAL 2 #define HPTSLOG_INSERT_SLEEPER 3 #define HPTSLOG_SLEEP_AFTER 4 #define HPTSLOG_SLEEP_BEFORE 5 #define HPTSLOG_INSERTED 6 #define HPTSLOG_WAKEUP_HPTS 7 #define HPTSLOG_SETTORUN 8 #define HPTSLOG_HPTSI 9 #define HPTSLOG_TOLONG 10 #define HPTSLOG_AWAKENS 11 #define HPTSLOG_TIMESOUT 12 #define HPTSLOG_SLEEPSET 13 #define HPTSLOG_WAKEUP_INPUT 14 #define
HPTSLOG_RESCHEDULE 15 #define HPTSLOG_AWAKE 16 #define HPTSLOG_INP_DONE 17 /* One log record; struct tcp_hpts_entry below holds a pointer to these in p_log. */ struct hpts_log { struct inpcb *inp; int32_t event; uint32_t cts; int32_t line; uint32_t ticknow; uint32_t t_paceslot; uint32_t t_hptsreq; uint32_t p_curtick; uint32_t p_prevtick; uint32_t slot_req; uint32_t p_on_queue_cnt; uint32_t p_nxt_slot; uint32_t p_cur_slot; uint32_t p_hpts_sleep_time; uint16_t p_flags; uint8_t p_onhpts; uint8_t p_oninput; uint8_t is_notempty; }; /* Type of the diag argument taken by tcp_hpts_insert_diag(). */ struct hpts_diag { uint32_t p_hpts_active; uint32_t p_nxt_slot; uint32_t p_cur_slot; uint32_t slot_req; uint32_t inp_hptsslot; uint32_t slot_now; uint32_t have_slept; uint32_t hpts_sleep_time; uint32_t yet_to_sleep; uint32_t need_new_to; int32_t co_ret; uint8_t p_on_min_sleep; }; #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ uint32_t p_hpts_active; /* Flag that says hpts is awake */ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on.
*/ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ uint32_t enobuf_cnt; uint16_t p_log_at; uint8_t p_direct_wake :1, /* boolean */ p_log_wrapped :1, /* boolean */ p_on_min_sleep:1; /* boolean */ uint8_t p_fill; /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; struct hpts_log *p_log; uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; struct sysctl_oid *hpts_root; struct intr_event *ie; void *ie_cookie; uint16_t p_num; /* The hpts number one per cpu */ uint16_t p_cpu; /* The hpts CPU */ /* There is extra space in here */ /* Cache line 0x100 */ struct callout co __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); struct tcp_hptsi { struct proc *rp_proc; /* Process structure for hpts */ struct tcp_hpts_entry **rp_ent; /* Array of hptss */ uint32_t rp_num_hptss; /* Number of hpts threads */ }; #endif #define HPTS_REMOVE_INPUT 0x01 #define HPTS_REMOVE_OUTPUT 0x02 #define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT) /* * When using the hpts, a TCP stack must make sure * that once an INP_DROPPED flag is applied to an INP * that it does not expect tcp_output() to ever be * called by the hpts. The hpts will *not* call * any output (or input) functions on a TCB that * is in the DROPPED state. * * This implies final ACK's and RST's that might * be sent when a TCB is still around must be * sent from a routine like tcp_respond(). */ #define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep * this determines min granularity of the * hpts. If 0, granularity is 10useconds at * the cost of more CPU (context switching).
*/ #ifdef _KERNEL #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp); struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp); int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line); #define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__) struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp); #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__) void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line); /* * To insert a TCB on the hpts you *must* be holding the * INP_WLOCK(). The hpts insert code will then acquire * the hpts's lock and insert the TCB on the requested * slot possibly waking up the hpts if you are requesting * a time earlier than what the hpts is sleeping to (if * the hpts is sleeping). You may check the inp->inp_in_hpts * flag without the hpts lock. The hpts is the only one * that will clear this flag holding only the hpts lock. This * means that in your tcp_output() routine when you test for * it to be 1 (so you will not call output) it may be transitioning * to 0 (by the hpts). That will be fine since that will just * mean an extra call to tcp_output that most likely will find * the call you executed (when the mis-match occurred) will have * put the TCB back on the hpts and it will return. If your * call did not add it back to the hpts then you will either * over-send or the cwnd will block you from sending more. * * Note you should also be holding the INP_WLOCK() when you * call the remove from the hpts as well. Though usually * you are either doing this from a timer, where you need * that INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK().
*/ uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line); #define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__) uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag); int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); /* NOTE(review): the expansion of the next macro ends in a semicolon, so a call written with its own semicolon yields an extra empty statement -- confirm this is intended. */ #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked); + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line); + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); /* NOTE(review): the macro below still forwards seven arguments plus __LINE__, while the added (+) prototype above declares six parameters plus line -- confirm and bring the two into agreement. */ #define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); void __tcp_set_hpts(struct inpcb *inp, int32_t line); #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) extern int32_t tcp_min_hptsi_time; /* Convert a timeval to a count of 10-usec units (100000 per second). */ static __inline uint32_t tcp_tv_to_hptstick(struct timeval *sv) { return ((sv->tv_sec * 100000) + (sv->tv_usec / 10)); } /* Read microuptime() into *sv, or into a local timeval when sv is NULL, and return it converted by tcp_tv_to_hptstick(). */ static __inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; if (sv == NULL) sv = &tv; microuptime(sv); return (tcp_tv_to_hptstick(sv)); } /* tv_sec scaled by HPTS_USEC_IN_SEC plus tv_usec, cast to uint32_t. */ static __inline uint32_t tcp_tv_to_usectick(struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } /* tv_sec scaled by HPTS_MSEC_IN_SEC plus tv_usec divided by HPTS_USEC_IN_MSEC, cast to uint32_t. */ static __inline uint32_t tcp_tv_to_mssectick(struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } /* Unlock the p_mtx of the given hpts. */ static __inline void tcp_hpts_unlock(struct tcp_hpts_entry *hpts) { mtx_unlock(&hpts->p_mtx); } /* Like tcp_gethptstick(), but the result is converted by tcp_tv_to_usectick(). */ static __inline uint32_t
tcp_get_usecs(struct timeval *tv) { struct timeval tvd; if (tv == NULL) tv = &tvd; microuptime(tv); return (tcp_tv_to_usectick(tv)); } #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 335923) +++ head/sys/netinet/tcp_input.c (revision 335924) @@ -1,3865 +1,3808 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include #include const int tcprexmtthresh = 3; int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0,
"Log all incoming TCP segments to closed ports"); /* Most tunables below pair a VNET_DEFINE holding the default value with a SYSCTL_INT entry under _net_inet_tcp that points at it. */ VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc6675_pipe), 0, "Use calculated pipe/in-flight bytes per RFC 6675"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); /* NOTE(review): the abc_l_var entry passes 2 in the value slot where the neighbouring entries pass 0 -- confirm whether that is intentional. */ VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); /* The ECN tunables are attached to their own ecn node, _net_inet_tcp_ecn. */ static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int,
tcp_do_ecn) = 2; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array.
*/ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); /* Allocate the TCP_NSTATES state counters and the tcpstat counters with M_WAITOK; hooked up through the VNET_SYSINIT that follows. The argument is not used. */ static void tcp_vnet_init(const void *unused) { COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_init, NULL); #ifdef VIMAGE /* Free the counters allocated by tcp_vnet_init(); only compiled when VIMAGE is defined. The argument is not used. */ static void tcp_vnet_uninit(const void *unused) { COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); } VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_uninit, NULL); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array. * * Adds 1 to the selected counter. NOTE(review): statnum is not range * checked here -- presumably callers pass a valid index; confirm. */ void kmod_tcpstat_inc(int statnum) { counter_u64_add(VNET(tcpstat)[statnum], 1); } #ifdef TCP_HHOOK /* * Wrapper for the TCP established input helper hook.
*/ void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tcp_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } } void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. 
Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * Support for user specified value for initial flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else if (V_tcp_initcwnd_segments) tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg, max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (maxseg > 2190) tp->snd_cwnd = 2 * maxseg; else if (maxseg > 1095) tp->snd_cwnd = 3 * maxseg; else tp->snd_cwnd = 4 * maxseg; } if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { u_int maxseg; INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_RTO: maxseg = tcp_maxseg(tp); tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / maxseg) * maxseg; tp->snd_cwnd = maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. 
*/ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? */ tp->t_bytes_acked = 0; } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. 
*/ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) static void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->ecnpkt_handler != NULL) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->ccv->flags |= CCF_IPHDR_CE; break; case IPTOS_ECN_ECT0: tp->ccv->flags &= ~CCF_IPHDR_CE; break; case IPTOS_ECN_ECT1: tp->ccv->flags &= ~CCF_IPHDR_CE; break; } if (th->th_flags & TH_CWR) tp->ccv->flags |= CCF_TCPHDR_CWR; else tp->ccv->flags &= ~CCF_TCPHDR_CWR; if (tp->t_flags & TF_DELACK) tp->ccv->flags |= CCF_DELACK; else tp->ccv->flags &= ~CCF_DELACK; CC_ALGO(tp)->ecnpkt_handler(tp->ccv); if (tp->ccv->flags & CCF_ACKNOW) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } } /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? 
*/ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ifa_free(&ia6->ia_ifa); ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ int tcp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; + struct epoch_tracker et; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). 
*/ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. 
*/ len = off0 + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment; if we're likely to add or remove a * connection then first acquire pcbinfo lock. There are three cases * where we might discover later we need a write lock despite the * flags: ACKs moving a connection out of the syncache, ACKs for a * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: #ifdef INVARIANTS if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? 
ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK_ASSERT(inp); /* * While waiting for inp lock during the lookup, another thread * can have dropped the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. */ if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); inp = NULL; goto findpcb; } if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && ((inp->inp_socket == NULL) || (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. 
*/ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb * lock, and not the inpcbinfo write lock. If so, we need to try to * acquire it, or if that fails, acquire a reference on the inpcb, * drop all locks, acquire a global write lock, and then re-acquire * the inpcb lock. We may at that point discover that another thread * has tried to free the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. * * XXXRW: It may be time to rethink timewait locking. */ if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK(); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif /* * We've identified a valid inpcb, but it could be that we need an * inpcbinfo write lock but don't hold it. 
In this case, attempt to * acquire using the same strategy as the TIMEWAIT case above. If we * relock, we have to jump back to 'relocked' as the connection might * now be in TIMEWAIT. */ #ifdef INVARIANTS if ((thflags & (TH_FIN | TH_RST)) != 0) INP_INFO_RLOCK_ASSERT(&V_tcbinfo); #endif if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !IS_FASTOPEN(tp->t_flags)))) { if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK(); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), ("%s: so accepting but tp %p not listening", __func__, tp)); if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. 
*/ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ rstreason = syncache_expand(&inc, &to, th, &so, m); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). */ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. 
*/ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, - iptos, ti_locked); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + iptos); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. 
*/ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (ia6) ifa_free(&ia6->ia_ifa); } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. 
* If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. 
*/ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in an unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt to go through unhandled. */ goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain and unlocks * the inpcb; the pcbinfo read lock, if held, is dropped by this * function once tcp_do_segment() returns.
*/ - tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) INP_WUNLOCK(inp); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. 
For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 3. received bytes per RTT exceed seven eighths of the * current socket buffer size; * 4. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. * * Returns the new receive buffer size to request, or 0 when no step up * is warranted. This function only computes the size and maintains the * per-window counters (rfbuf_ts, rfbuf_cnt); it does not resize the * socket buffer itself. */ int tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int tlen) { int newsize = 0; /* * Evaluate only when autosizing is enabled (V_tcp_do_autorcvbuf and * SB_AUTOSIZE), t_srtt and rfbuf_ts are both nonzero, and the time * since rfbuf_ts, converted with TCP_TS_TO_TICKS, is greater than * t_srtt >> TCP_RTT_SHIFT. Otherwise just count the bytes. */ if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > (tp->t_srtt >> TCP_RTT_SHIFT)) { /* * Step up by V_tcp_autorcvbuf_inc, capped at * V_tcp_autorcvbuf_max, when the bytes counted in this * window exceed 7/8 of sb_hiwat and the cap is not yet reached. */ if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); /* Start over with next RTT.
*/ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else { tp->rfbuf_cnt += tlen; /* accumulate bytes for the current window */ } return (newsize); } void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, - int ti_locked) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win; uint32_t tiwin; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; int tfo_syn; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); - /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be either locked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " - "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); #ifdef TCPPCAP /* Save segment, if requested.
*/ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } goto drop; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. 
Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; else if (tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) tcp_fastopen_update_cache(tp, to.to_mss, to.to_tfo_len, to.to_tfo_cookie); else tcp_fastopen_disable_path(tp); } } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa.
*/ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
*/ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. 
*/ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. 
*/ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. 
* Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. 
* Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " - "ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. 
This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, - ("%s: TH_RST ti_locked %d, th %p tp %p", - __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state.
*/ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { - KASSERT(ti_locked == TI_RLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " - "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
*/ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to * regular ACK processing below. */ tp->snd_una++; } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. 
If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) sack_changed = tcp_sack_doack(tp, &to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { u_int maxseg; maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). 
* * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || ((tp->t_flags & TF_SACK_PERMIT) && !sack_changed)) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. 
*/ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. 
*/ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. */ if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) tp->t_dupacks++; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. 
If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. 
*/ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. 
* Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. 
* This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). 
*/ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. 
*/ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " - "TCP_FIN_WAIT_2 ti_locked: %d", __func__, - ti_locked)); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); check_delack: - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - } -#ifdef INVARIANTS - else - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); -#endif - /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); }

/*
 * Issue RST and make ACK acceptable to originator of segment.
 * The mbuf must still include the original packet header.
 * tp may be NULL.
 *
 * The mbuf chain m is always disposed of here: it is either handed to
 * tcp_respond() or released with m_freem() on the drop path, so the
 * caller must not touch it afterwards.
 *
 * No reset is generated (the segment is silently freed) when:
 *  - the incoming segment itself carries TH_RST;
 *  - the mbuf is flagged M_BCAST or M_MCAST;
 *  - either address is multicast (IPv4 and IPv6), the IPv4 source is
 *    INADDR_BROADCAST, or in_broadcast() reports the IPv4 destination as
 *    a broadcast address for the receive interface;
 *  - badport_bandlim(rstreason) returns a negative value.
 *
 * When tp is non-NULL its inpcb must be write-locked (asserted below);
 * this function does not unlock it.
 */
void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
    int tlen, int rstreason)
{
#ifdef INET
	struct ip *ip;
#endif
#ifdef INET6
	struct ip6_hdr *ip6;
#endif

	if (tp != NULL) {
		INP_WLOCK_ASSERT(tp->t_inpcb);
	}

	/*
	 * Never answer a RST with a RST, and don't bother if the segment
	 * was flagged as broadcast/multicast on the mbuf.
	 */
	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
		goto drop;
#ifdef INET6
	/*
	 * The address family is picked by reading the version field through
	 * the IPv4 header view of the packet.
	 */
	if (mtod(m, struct ip *)->ip_v == 6) {
		ip6 = mtod(m, struct ip6_hdr *);
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
			goto drop;
		/* IPv6 anycast check is done at tcp6_input() */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
	}
#endif

	/* Perform bandwidth limiting. */
	if (badport_bandlim(rstreason) < 0)
		goto drop;

	/*
	 * tcp_respond consumes the mbuf chain.
	 *
	 * If the offending segment carried an ACK, the reset is a bare RST
	 * built from th_ack.  Otherwise it is RST|ACK built from th_seq plus
	 * the segment length, where SYN and FIN each add one to the length.
	 * NOTE(review): assumes tcp_respond()'s two tcp_seq arguments are
	 * (ack, seq) in that order -- confirm against its prototype.
	 */
	if (th->th_flags & TH_ACK) {
		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
		    th->th_ack, TH_RST);
	} else {
		if (th->th_flags & TH_SYN)
			tlen++;
		if (th->th_flags & TH_FIN)
			tlen++;
		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
		    (tcp_seq)0, TH_RST|TH_ACK);
	}
	return;
drop:
	m_freem(m);
}

/*
 * Parse TCP options and place in tcpopt.
*/ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_SIGNATURE: /* * In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have * to record the fact that the option was observed * here for the syncache code to perform the correct * response. */ if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; case TCPOPT_FAST_OPEN: /* * Cookie length validation is performed by the * server side cookie checking code or the client * side cookie cache update code. 
*/ if (!(flags & TO_SYN)) continue; if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) continue; to->to_flags |= TOF_FASTOPEN; to->to_tfo_len = optlen - 2; to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tp->t_inpcb); TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). 
The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. 
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * NOTE that resulting t_maxseg doesn't include space for TCP options or * IP options, e.g. IPSEC data, since length of this data may vary, and * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; uint32_t maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. 
*/ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxseg above. */ offer = tp->t_maxseg; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. 
Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int mss; uint32_t bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? 
*/ tp->t_maxseg = max(mss, 64); SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; uint32_t thcmtu = 0; uint32_t maxmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; uint32_t ocwnd = tp->snd_cwnd; u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. 
* (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += maxseg; } int tcp_compute_pipe(struct tcpcb *tp) { return (tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes); } Index: head/sys/netinet/tcp_stacks/fastpath.c =================================================================== --- head/sys/netinet/tcp_stacks/fastpath.c (revision 335923) +++ head/sys/netinet/tcp_stacks/fastpath.c (revision 335924) @@ -1,2438 +1,2327 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2015 Netflix Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Portions of this software were developed by Randall R. 
Stewart while * working for Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); + struct socket *, struct tcpcb *, int, int, uint8_t); static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); + struct socket *, struct tcpcb *, int, int, uint8_t); /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * So how is this faster than the normal fast ack? * It basically allows us to also stay in the fastpath * when a window-update ack also arrives. 
In testing * we saw only 25-30% of connections doing fastpath * due to the fact that along with moving forward * in sequence the window was also updated. */ static void tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, - int ti_locked, uint32_t tiwin) + uint32_t tiwin) { int acked; uint16_t nsegs; int winup_only=0; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif + /* * The following if statement will be true if * we are doing the win_up_in_fp * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) * - No more new data, but we have an ack for new data * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) * - No more new data, the same ack point but the window grew * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) */ if ((SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { winup_only = 1; TCPSTAT_INC(tcps_rcvwinupd); } tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. 
*/ - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } if (winup_only == 0) { acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. 
*/ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); m_freem(m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else { /* * Window update only, just free the mbufs and * send out whatever we can. */ m_freem(m); } sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. */ static void tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, - int ti_locked, uint32_t tiwin) + uint32_t tiwin) { int newsize = 0; /* automatic sockbuf scaling */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } - /* - * This is a pure, in-sequence data packet with - * nothing on the reassembly queue and we have enough - * buffer space to take it. 
- */ - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * The slow-path is the clone of the long long part * of tcp_do_segment past all the fast-path stuff. We * use it here by two different callers, the fast/slow and * the fastack only. 
*/ static void tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, - int ti_locked, uint32_t tiwin, int thflags) + uint32_t tiwin, int thflags) { int acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree = NULL; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ inc = &tp->t_inpcb->inp_inc; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); - switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. 
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. 
* Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " - "ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. 
If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, - ("%s: TH_RST ti_locked %d, th %p tp %p", - __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { - KASSERT(ti_locked == TI_RLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. 
*/ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. 
* But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " - "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. 
* NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to->to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. 
*/ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. 
*/ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. 
*/ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. 
*/ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. 
*/ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { tp->snd_wnd -= sbavail(&so->so_snd); mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. 
*/ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. 
We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. 
* Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. 
*/ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " - "TCP_FIN_WAIT_2 ti_locked: %d", __func__, - ti_locked)); - tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, m); - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - } -#ifdef INVARIANTS - else - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); -#endif - /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, m); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Do fast slow is a combination of the original * tcp_dosegment and a split fastpath, one function * for the fast-ack which also includes allowing fastpath * for window advanced in sequence acks. And also a * sub-function that handles the insequence data. 
*/ void tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, - int ti_locked) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { /* * Entry point: performs the sanity checks, ECN handling and option * parsing shared by every path, then hands the segment to * tcp_do_fastack(), tcp_do_fastnewdata() or tcp_do_slowpath(). * The two early-exit checks below release the inpcb write lock * themselves before returning. */ int thflags; uint32_t tiwin; char *s; uint16_t nsegs; int can_enter; struct in_conninfo *inc; struct tcpopt to; thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; nsegs = max(1, m->m_pkthdr.lro_nsegs); /* * If this is either a state-changing packet or current state isn't * established, we require the tcbinfo lock to be held (the assertion * below checks for the read lock). Otherwise, we allow the tcbinfo * to be either locked or unlocked, as the caller may have * unnecessarily acquired the lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " - "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first.
*/ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } INP_WUNLOCK(tp->t_inpcb); return; } tp->sackhint.last_sack_ack = 0; /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a SYN * or SYN|ACK) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up.
*/ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } can_enter = 0; if (__predict_true((tlen == 0))) { /* * The ack moved forward and we have a window (non-zero) * -- or -- * The ack did not move forward, but the window increased. */ if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { can_enter = 1; } } else { /* * Data incoming, use the old entry criteria * for fast-path with data. */ if ((tiwin && tiwin == tp->snd_wnd)) { can_enter = 1; } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer.
Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TF_NEEDSYN. */ if (__predict_true(tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && can_enter && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { if (__predict_true((tlen == 0) && (SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { /* We are done */ tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, - ti_locked, tiwin); + tiwin); return; } else if ((tlen) && (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv))) { tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, - ti_locked, tiwin); + tiwin); /* We are done */ return; } } tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, - ti_locked, tiwin, thflags); + tiwin, thflags); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path.
*/ static int tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, - int ti_locked, uint32_t tiwin) + uint32_t tiwin) { /* * Returns 0, before any connection state has been modified, when one * of the precondition checks below fails. Otherwise the ACK is * processed here, the mbuf is freed, the inpcb write lock is released * and 1 is returned. */ int acked; uint16_t nsegs; int winup_only=0; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(th->th_ack == tp->snd_una) && __predict_false(tiwin <= tp->snd_wnd)) { /* * Duplicate ack whose advertised window did not grow. * NOTE(review): if SEQ_LEQ is inclusive of equality, the check * above has already returned for th_ack == snd_una and this * branch cannot be taken -- confirm. */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { /* We are retransmitting */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } if (__predict_false(to->to_flags & TOF_SACK)) { /* Sack included in the ack.. */ return (0); } if (!TAILQ_EMPTY(&tp->snd_holes)) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ /* Did the window get updated?
*/ if (tiwin != tp->snd_wnd) { /* keep track of pure window updates: same ack as snd_wl2, larger window */ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { winup_only = 1; TCPSTAT_INC(tcps_rcvwinupd); } tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } - ti_locked = TI_UNLOCKED; - TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } if (winup_only == 0) { acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks.
*/ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; tp->t_dupacks = 0; /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); m_freem(m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* Wake up the socket if we have room to write more */ sowwakeup(so); } else { /* * Window update only, just free the mbufs and * send out whatever we can. */ m_freem(m); } if (sbavail(&so->so_snd)) (void) tcp_output(tp); - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return (1); } /* * This tcp-do-segment concentrates on making the fastest * ack processing path. It does not have a fast-path for * data (it possibly could which would then eliminate the * need for fast-slow above).
For a content distributor having * large outgoing elephants and very very little coming in * having no fastpath for data does not really help (since you * don't get much data in). The most important thing is * processing ack's quickly and getting the rest of the data * output to the peer as quickly as possible. This routine * seems to be about an overall 3% faster then the old * tcp_do_segment and keeps us in the fast-path for packets * much more (by allowing window updates to also stay in the fastpath). */ void tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, - int ti_locked) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags; uint32_t tiwin; char *s; struct in_conninfo *inc; struct tcpopt to; thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. 
*/ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " - "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - } INP_WUNLOCK(tp->t_inpcb); return; } tp->sackhint.last_sack_ack = 0; /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. 
*/ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. 
*/ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. 
*/ if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && __predict_true(((to.to_flags & TOF_SACK) == 0)) && __predict_true(tlen == 0) && __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && __predict_true(LIST_EMPTY(&tp->t_segq)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, - ti_locked, tiwin)) { + tiwin)) { return; } } tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, - ti_locked, tiwin, thflags); + tiwin, thflags); } struct tcp_function_block __tcp_fastslow = { .tfb_tcp_block_name = "fastslow", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment_fastslow, .tfb_tcp_ctloutput = tcp_default_ctloutput, }; struct tcp_function_block __tcp_fastack = { .tfb_tcp_block_name = "fastack", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment_fastack, .tfb_tcp_ctloutput = tcp_default_ctloutput }; static int tcp_addfastpaths(module_t mod, int type, void *data) { int err = 0; switch (type) { case MOD_LOAD: err = register_tcp_functions(&__tcp_fastack, M_WAITOK); if (err) { printf("Failed to register fastack module -- err:%d\n", err); return(err); } err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); if (err) { printf("Failed to register fastslow module -- err:%d\n", err); deregister_tcp_functions(&__tcp_fastack, false, true); return(err); } break; case MOD_QUIESCE: if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { return(EBUSY); } err = deregister_tcp_functions(&__tcp_fastack, true, false); err = deregister_tcp_functions(&__tcp_fastslow, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_fastack, false, true); err = deregister_tcp_functions(&__tcp_fastslow, false, true); if (err == EBUSY) break; err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t new_tcp_fastpaths = { .name = "tcp_fastpaths", .evhand = tcp_addfastpaths, .priv = 0 }; MODULE_VERSION(kern_tcpfastpaths, 1); 
DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); Index: head/sys/netinet/tcp_stacks/rack.c =================================================================== --- head/sys/netinet/tcp_stacks/rack.c (revision 335923) +++ head/sys/netinet/tcp_stacks/rack.c (revision 335924) @@ -1,9190 +1,9131 @@ /*- * Copyright (c) 2016-2018 * Netflix Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #ifdef NETFLIX_STATS #include #endif #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #ifdef NETFLIX_CWV #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_rack.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; uma_zone_t rack_pcb_zone; #ifndef TICKS2SBT #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) #endif struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; #define CUM_ACKED 1 #define SACKED 2 /* * The RACK module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Matt Mathis's Rate Halving which slowly drops * the congestion window so that the ack clock can * be maintained during a recovery. * - Yuchung Cheng's RACK TCP (for which its named) that * will stop us using the number of dup acks and instead * use time as the gage of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et.al. 
* RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. * * To implement RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states. The do_segement * functions role in life is to mandate the connection supports SACK * initially and then assure that the RACK state matches the conenction * state before calling the states do_segment function. Each * state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also over-written with a new version since it * must maintain the new rack scoreboard. * */ static int32_t rack_precache = 1; static int32_t rack_tlp_thresh = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ static int32_t rack_pkt_delay = 1; static int32_t rack_inc_var = 0;/* For TLP */ static int32_t rack_reduce_largest_on_idle = 0; static int32_t rack_min_pace_time = 0; static int32_t rack_min_pace_time_seg_req=6; static int32_t rack_early_recovery = 1; static int32_t rack_early_recovery_max_seg = 6; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up * being a total of 122.850 seconds before a * connection is killed. 
*/ static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ static int32_t rack_rto_max = 30000; /* 30 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 1; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_lower_cwnd_at_tlp = 0; static int32_t rack_use_proportional_reduce = 0; static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; /* Rack specific counters */ counter_u64_t rack_badfr; counter_u64_t rack_badfr_bytes; counter_u64_t rack_rtm_prr_retran; counter_u64_t rack_rtm_prr_newdata; counter_u64_t rack_timestamp_mismatch; counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; counter_u64_t rack_tlp_newdata; counter_u64_t rack_tlp_retran; counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_tlp_retran_fail; counter_u64_t rack_to_tot; counter_u64_t rack_to_arm_rack; counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_runt_sacks; counter_u64_t rack_used_tlpmethod; counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; counter_u64_t rack_tlp_does_nada; /* Temp CPU counters */ counter_u64_t rack_find_high; counter_u64_t rack_progress_drops; 
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); static void rack_counter_destroy(void); static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, - uint8_t iptos, int32_t ti_locked); + uint8_t iptos); static void rack_dtor(void *mem, int32_t size, void *arg); static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts); static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm); static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_fini(struct tcpcb *tp, int32_t 
tcb_is_purged); static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp); static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); static int32_t rack_output(struct tcpcb *tp); static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, - uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv); + uint8_t iptos, int32_t nxt_pkt, struct timeval *tv); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts); static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta); static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * 
lenp); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val); + struct tcpcb *tp, int32_t * ret_val); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void +rack_do_drop(struct mbuf *m, struct tcpcb *tp); +static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); + struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); + struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fastnewdata(struct 
mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t nxt_pkt); static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, - int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); + int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_drop_checks(struct tcpopt *to, struct mbuf *m, - struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val); static int 
rack_process_rst(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, int32_t * ti_locked); + struct socket *so, struct tcpcb *tp); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); static int rack_ts_check(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val); + struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); int32_t rack_clear_counter=0; static int sysctl_rack_clear(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); if (error || req->newptr == NULL) return error; error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef INVARIANTS printf("Clearing RACK counters\n"); #endif counter_u64_zero(rack_badfr); counter_u64_zero(rack_badfr_bytes); counter_u64_zero(rack_rtm_prr_retran); counter_u64_zero(rack_rtm_prr_newdata); counter_u64_zero(rack_timestamp_mismatch); counter_u64_zero(rack_reorder_seen); counter_u64_zero(rack_tlp_tot); counter_u64_zero(rack_tlp_newdata); counter_u64_zero(rack_tlp_retran); counter_u64_zero(rack_tlp_retran_bytes); counter_u64_zero(rack_tlp_retran_fail); counter_u64_zero(rack_to_tot); counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_find_high); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); 
counter_u64_zero(rack_used_tlpmethod2); counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); } rack_clear_counter = 0; return (0); } static void rack_init_sysctls() { SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &rack_ignore_data_after_close, 0, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_time", CTLFLAG_RW, &rack_min_pace_time, 0, "Should we enforce a minimum pace time of 1ms"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_segs", CTLFLAG_RW, &rack_min_pace_time_seg_req, 6, "How many segments have to be in the len to enforce min-pace-time"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "idle_reduce_high", CTLFLAG_RW, &rack_reduce_largest_on_idle, 0, "Should we reduce the largest cwnd seen to IW on idle reduction"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &rack_verbose_logging, 0, "Should RACK black box logging be verbose"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sackfiltering", CTLFLAG_RW, &rack_use_sack_filter, 1, "Do we use sack filtering?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "delayed_ack", CTLFLAG_RW, &rack_delayed_ack_time, 200, "Delayed ack time (200ms)"); 
SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "precache", CTLFLAG_RW, &rack_precache, 0, "Where should we precache the mcopy (0 is not at all)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &rack_sack_block_limit, 128, "When do we start paying attention to small sack blocks"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 1, "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, &rack_tlp_in_recovery, 1, "Can we do a TLP during recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minrto", CTLFLAG_RW, &rack_rto_min, 0, "Minimum RTO in ms -- set with caution below 1000 due to TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "maxrto", CTLFLAG_RW, &rack_rto_max, 0, "Maxiumum RTO in ms -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retry", CTLFLAG_RW, &rack_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, &rack_use_proportional_reduce, 0, "Should we proportionaly reduce cwnd based on the number of losses "); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_prop", CTLFLAG_RW, 
&rack_proportional_rate, 10, "What percent reduction per loss"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, &rack_lower_cwnd_at_tlp, 0, "When a TLP completes a retran should we enter recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_reduces", CTLFLAG_RW, &rack_slot_reduction, 4, "When setting a slot should we reduce by divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, &rack_pace_every_seg, 1, "Should we pace out every segment hptsi"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, &rack_hptsi_segments, 6, "Should we pace out only a limited size of segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prr_sendalot", CTLFLAG_RW, &rack_send_a_lot_in_prr, 1, "Send a lot in prr"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minto", CTLFLAG_RW, &rack_min_to, 1, "Minimum rack timeout in milliseconds"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, &rack_early_recovery_max_seg, 6, "Max segments in early recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecovery", CTLFLAG_RW, &rack_early_recovery, 1, "Do we do early recovery with rack"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &rack_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &rack_tlp_thresh, 1, "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reorder_fade", CTLFLAG_RW, 
&rack_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, &rack_inc_var, 0, "Should rack add to the TLP timer the variance in rtt calculation"); rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total number of bad FRs"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndret", CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of timestamps that we could not find the reported ts"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FIN causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay 
due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack to expired?"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer armed?"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer armed?"); rack_paced_segments = counter_u64_alloc(M_WAITOK); 
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times a segment did not cause hptsi"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times a segment did not cause hptsi"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total alocations done from emergency cache"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_restart", 
CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, &rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of runt sacks"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of runt sacks 2"); rack_runt_sacks = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "runtsacks", CTLFLAG_RD, &rack_runt_sacks, "Total number of runt sacks"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 
SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "outsize", CTLFLAG_RD,
	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
	/* Allocate the option-statistics counter array and export it as "opts". */
	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "opts", CTLFLAG_RD,
	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
	/* Read/write "clear" node; requests are handled by sysctl_rack_clear. */
	SYSCTL_ADD_PROC(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}

/*
 * Report whether the connection has exceeded its progress timeout.
 *
 * Returns 1 when both t_maxunacktime and t_acktime are set, the current
 * tick count is past t_acktime, and at least t_maxunacktime ticks have
 * elapsed since t_acktime.  In that case the rack_progress_drops counter
 * is bumped and a PROGRESS_DROP event is logged before returning.
 * Returns 0 otherwise, with no side effects.
 */
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
			/*
			 * There is an assumption that the caller
			 * will drop the connection so we will
			 * increment the counters here.
			 */
			struct tcp_rack *rack;

			rack = (struct tcp_rack *)tp->t_fb_ptr;
			counter_u64_add(rack_progress_drops, 1);
#ifdef NETFLIX_STATS
			TCPSTAT_INC(tcps_progdrops);
#endif
			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
			return (1);
		}
	}
	return (0);
}

/*
 * Emit a BBR_LOG_TIMERSTAR log record when logging is enabled on the
 * connection.  The record carries the smoothed rtt converted with
 * TICKS_2_MSEC (flex1), 'to' (flex2), the hpts flags (flex3), 'slot'
 * (flex4), the inp hpts slot (flex5), t_rxtcur (flex6) and 'which' (flex8).
 * The 'cts' argument is not used here.
 */
static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		/* Start from a zeroed record so unset fields are logged as 0. */
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
		log.u_bbr.flex2 = to;
		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex4 = slot;
		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex8 = which;
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIMERSTAR, 0,
		    0, &log, false);
	}
}

/*
 * Emit a BBR_LOG_RTO log record when logging is enabled on the connection.
 * 'to_num' is stored in flex8; flex1/flex2 carry rc_rack_min_rtt and
 * rc_rack_rtt.
 */
static void
rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex8 = to_num;
		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
		log.u_bbr.flex2 = rack->rc_rack_rtt;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_RTO, 0,
		    0, &log, false);
	}
}

/*
 * Emit a BBR_LOG_BBRRTT log record when logging is enabled on 'tp'.
 * 't', 'o_srtt' and 'o_var' are recorded verbatim in flex1..flex3; the
 * remaining fields are copied from the rate-sample state in
 * rack->r_ctl.rack_rs and from rc_rate_sample_method.
 */
static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
    uint32_t o_srtt, uint32_t o_var)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = t;
		log.u_bbr.flex2 = o_srtt;
		log.u_bbr.flex3 = o_var;
		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
		TCP_LOG_EVENT(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRRTT, 0,
		    0, &log, false);
	}
}

static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
	/*
	 * Log the rtt sample we are
	 * applying to the srtt algorithm in
	 * useconds.
*/ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; /* Convert our ms to a microsecond */ log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &tv); } } static inline void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; TCP_LOG_EVENT(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false); } } static void rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, 0, &log, false); } } static void rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; 
TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, false); } } static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false); } } static void rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = 0; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = hpts_removed; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false); } } static void rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false); } } static void rack_counter_destroy() { 
counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); counter_u64_free(rack_rtm_prr_retran); counter_u64_free(rack_rtm_prr_newdata); counter_u64_free(rack_timestamp_mismatch); counter_u64_free(rack_reorder_seen); counter_u64_free(rack_tlp_tot); counter_u64_free(rack_tlp_newdata); counter_u64_free(rack_tlp_retran); counter_u64_free(rack_tlp_retran_bytes); counter_u64_free(rack_tlp_retran_fail); counter_u64_free(rack_to_tot); counter_u64_free(rack_to_arm_rack); counter_u64_free(rack_to_arm_tlp); counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); counter_u64_free(rack_find_high); counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); counter_u64_free(rack_tlp_does_nada); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } static struct rack_sendmap * rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; counter_u64_add(rack_to_alloc, 1); rack->r_ctl.rc_num_maps_alloced++; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { return (rsm); } if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; return (rsm); } return (NULL); } static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { rack->r_ctl.rc_num_maps_alloced--; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_next == rsm) rack->r_ctl.rc_next = NULL; if 
(rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt++; return; } uma_zfree(rack_zone, rsm); } /* * CC wrapper hook functions */ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery) { #ifdef NETFLIX_STATS int32_t gput; #endif #ifdef NETFLIX_CWV u_long old_cwnd = tp->snd_cwnd; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } } if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef NETFLIX_STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t) tp->snd_cwnd) - tp->snd_wnd); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an * API to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; #ifdef NETFLIX_CWV if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. This gives us roughly * one update per round trip time. 
*/
				tcp_update_peakrate_thr(tp);
			}
#endif
		}
#endif
		/* Byte counting only accumulates while cwnd is above ssthresh. */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->ccv->flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}
	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->ack_received(tp->ccv, type);
	}
#ifdef NETFLIX_STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
	/* Remember the largest cwnd seen so far. */
	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
	}
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		/*
		 * Per RFC 7661, the behaviour in the non-validated phase is
		 * specified as:
		 *
		 * o A sender determines whether to increase the cwnd based
		 *   upon whether it is cwnd-limited (see Section 4.5.3):
		 *
		 *   - A sender that is cwnd-limited MAY use the standard
		 *     TCP method to increase cwnd (i.e., the standard
		 *     method permits a TCP sender that fully utilises the
		 *     cwnd to increase the cwnd each time it receives an
		 *     ACK).
		 *
		 *   - A sender that is not cwnd-limited MUST NOT increase
		 *     the cwnd when ACK packets are received in this phase
		 *     (i.e., needs to avoid growing the cwnd when it has
		 *     not recently sent using the current size of cwnd).
		 */
		if ((tp->snd_cwnd > old_cwnd) &&
		    (tp->cwv_cwnd_valid == 0) &&
		    (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
			tp->snd_cwnd = old_cwnd;
		}
		/* Try to update pipeAck and NCWV state */
		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
		    !IN_RECOVERY(tp->t_flags)) {
			uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));

			tcp_newcwv_update_pipeack(tp, data);
		}
	}
	/* we enforce max peak rate if it is set. */
	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
		tp->snd_cwnd = tp->t_peakrate_thr;
	}
#endif
}

/*
 * Partial-ACK hook: if PRR still has a send allowance (rc_prr_sndcnt > 0),
 * bump r_wanted_output.  'th' is not used.  Asserts that the inpcb write
 * lock is held.
 */
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	INP_WLOCK_ASSERT(tp->t_inpcb);
	if (rack->r_ctl.rc_prr_sndcnt > 0)
		rack->r_wanted_output++;
}

/*
 * Leave recovery: run the congestion-control module's post_recovery hook
 * (if any), pick the post-recovery cwnd, fold any unused PRR send count
 * back into cwnd, and clear the recovery flags.  Asserts that the inpcb
 * write lock is held.
 */
static void
rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (CC_ALGO(tp)->post_recovery != NULL) {
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(tp->ccv);
	}
	/*
	 * Here we can in theory adjust cwnd to be based on the number of
	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
	 * based on the rack_use_proportional flag.
	 */
	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
		int32_t reduce;

		/* Percentage to cut cwnd by, capped at 50. */
		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
		if (reduce > 50) {
			reduce = 50;
		}
		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
	} else {
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
			tp->snd_cwnd = tp->snd_ssthresh;
		}
	}
	if (rack->r_ctl.rc_prr_sndcnt > 0) {
		/* Suck the next prr cnt back into cwnd */
		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
		rack->r_ctl.rc_prr_sndcnt = 0;
	}
	EXIT_RECOVERY(tp->t_flags);
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		if ((tp->cwv_cwnd_valid == 0) &&
		    (tp->snd_cwv.in_recovery))
			tcp_newcwv_end_recovery(tp);
	}
#endif
}

/*
 * Handle a congestion signal of the given 'type' (CC_NDUPACK, CC_ECN,
 * CC_RTO or CC_RTO_ERR): update the connection and RACK state for that
 * signal, then pass it to the congestion-control module's cong_signal
 * hook if it has one.  'th' may be NULL, in which case ccv->curack is
 * left unchanged.  Other 'type' values skip the switch and go straight to
 * the hook.  Asserts that the inpcb write lock is held.
 */
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (type) {
	case CC_NDUPACK:
/*		rack->r_ctl.rc_ssthresh_set = 1;*/
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			/* Entering fast recovery: reset the PRR bookkeeping. */
			rack->r_ctl.rc_tlp_rtx_out = 0;
			rack->r_ctl.rc_prr_delivered = 0;
			rack->r_ctl.rc_prr_out = 0;
			rack->r_ctl.rc_loss_count = 0;
			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags)) {
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		/*
		 * ssthresh becomes half of min(snd_wnd, snd_cwnd), in whole
		 * segments and never fewer than two; cwnd restarts at one
		 * segment.
		 */
		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
		    tp->t_maxseg) * tp->t_maxseg;
		tp->snd_cwnd = tp->t_maxseg;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_badrxtwin = 0;
		break;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
			tcp_newcwv_enter_recovery(tp);
		}
		if (type == CC_RTO) {
			tcp_newcwv_reset(tp);
		}
	}
#endif
}

/*
 * Restart after an idle period: run the congestion-control module's
 * after_idle hook (if any), compute the initial window i_cwnd from the
 * current settings and t_maxseg, and make sure snd_cwnd is not left below
 * it.  When 'reduce_largest' is non-zero, rc_rack_largest_cwnd is also
 * clamped down to i_cwnd.  Asserts that the inpcb write lock is held.
 */
static inline void
rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
{
	uint32_t i_cwnd;

	INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef NETFLIX_STATS
	TCPSTAT_INC(tcps_idle_restarts);
	if (tp->t_state == TCPS_ESTABLISHED)
		TCPSTAT_INC(tcps_idle_estrestarts);
#endif
	if (CC_ALGO(tp)->after_idle != NULL)
		CC_ALGO(tp)->after_idle(tp->ccv);

	if (tp->snd_cwnd == 1)
		i_cwnd = tp->t_maxseg;	/* SYN(-ACK) lost */
	else if (V_tcp_initcwnd_segments)
		i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
		    max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
	else if (V_tcp_do_rfc3390)
		i_cwnd = min(4 * tp->t_maxseg,
		    max(2 * tp->t_maxseg, 4380));
	else {
		/* Per RFC5681 Section 3.1 */
		if (tp->t_maxseg > 2190)
			i_cwnd = 2 * tp->t_maxseg;
		else if (tp->t_maxseg > 1095)
			i_cwnd = 3 *
tp->t_maxseg; else i_cwnd = 4 * tp->t_maxseg; } if (reduce_largest) { /* * Do we reduce the largest cwnd to make * rack play nice on restart hptsi wise? */ if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; } /* * Being idle is no differnt than the initial window. If the cc * clamps it down below the initial window raise it to the initial * window. */ if (tp->snd_cwnd < i_cwnd) { tp->snd_cwnd = i_cwnd; } } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ ((tp->t_flags & TF_DELACK) == 0) && \ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) static inline void rack_calc_rwin(struct socket *so, struct tcpcb *tp) { int32_t win; /* * Calculate amount of space in receive window, and then do TCP * input processing. Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) +rack_do_drop(struct mbuf *m, struct tcpcb *tp) { - if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - *ti_locked = TI_UNLOCKED; - } /* * Drop space held by incoming segment and return. 
*/ if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (m) m_freem(m); } static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) { - if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - *ti_locked = TI_UNLOCKED; - } if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void -rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val) +rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence * space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all paths to this * code happen after packets containing RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the segment * we received passes the SYN-RECEIVED ACK test. If it fails send a * RST. This breaks the loop in the "LAND" DoS attack, and also * prevents an ACK storm between two listening ports that have been * sent forged SYN segments, each with the source address of the * other. 
*/ struct tcp_rack *rack; if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } else *ret_val = 0; - if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - *ti_locked = TI_UNLOCKED; - } rack = (struct tcp_rack *)tp->t_fb_ptr; rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; if (m) m_freem(m); } static int -rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked) +rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in * window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should test against * last_ack_sent instead of rcv_nxt. Note 2: we handle special case * of closed window, not covered by the RFC. */ int dropped = 0; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(*ti_locked == TI_RLOCKED, - ("%s: TH_RST ti_locked %d, th %p tp %p", - __func__, *ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || ((tp->last_ack_sent - 1) == th->th_seq)) { TCPSTAT_INC(tcps_drops); /* Drop the connection. 
*/ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } dropped = 1; - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } else { m_freem(m); } return (dropped); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void -rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) +rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { - KASSERT(*ti_locked == TI_RLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp); } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); } } /* * rack_ts_check returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. 
*/ static int -rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) +rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates ts_recent, * the age will be reset later and ts_recent will get a * valid value. If it does not, setting ts_recent to zero * will at least satisfy the requirement that zero be placed * in the timestamp echo reply when ts_recent isn't valid. * The age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be dropped * when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); *ret_val = 0; if (tlen) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); } return (1); } return (0); } /* * rack_drop_checks returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. 
*/ static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; int32_t tlen; thflags = *thf; tlen = *tlenp; todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If segment ends after window, drop trailing data (and PUSH and * FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment and * ack. 
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH | TH_FIN); } *thf = thflags; *tlenp = tlen; return (0); } static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { struct rack_sendmap *rsm; /* * Walk the time-order transmitted list looking for an rsm that is * not acked. This will be the one that was sent the longest time * ago that is still outstanding. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { if (rsm->r_flags & RACK_ACKED) { continue; } goto finish; } finish: return (rsm); } static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *prsm; /* * Walk the sequence order list backward until we hit and arrive at * the highest seq not acked. In theory when this is called it * should be the last segment (which it was not). */ counter_u64_add(rack_find_high, 1); prsm = rsm; TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } return (prsm); } return (NULL); } static uint32_t rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) { int32_t lro; uint32_t thresh; /* * lro is the flag we use to determine if we have seen reordering. * If it gets set we have seen reordering. The reorder logic either * works in one of two ways: * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as * passed we no longer consider reordering has occuring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro * to 1. 
* * In the end if lro is non-zero we add the extra time for * reordering in. */ if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_reorder_ts) { if (rack->r_ctl.rc_reorder_fade) { if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { lro = cts - rack->r_ctl.rc_reorder_ts; if (lro == 0) { /* * No time as passed since the last * reorder, mark it as reordering. */ lro = 1; } } else { /* Negative time? */ lro = 0; } if (lro > rack->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ rack->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reodering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + rack->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (rack->r_ctl.rc_reorder_shift) thresh += (srtt >> rack->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1; } /* We don't let the rack timeout be above a RTO */ if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); } /* And we don't want it above the RTO max either */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } return (thresh); } static uint32_t rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t srtt) { struct rack_sendmap *prsm; uint32_t thresh, len; int maxseg; if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_tlp_threshold) thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); else thresh = (srtt * 2); /* Get the previous sent packet, if any */ maxseg = tcp_maxseg(tp); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { /* Exactly like the ID */ if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. 
*/ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, rack_head, r_tnext); if (prsm && (len <= maxseg)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). */ uint32_t inter_gap = 0; int idx, nidx; counter_u64_add(rack_used_tlpmethod, 1); idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= maxseg) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; counter_u64_add(rack_used_tlpmethod2, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { /* 2.2 behavior */ if (len <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } /* Not above an RTO */ if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { thresh = TICKS_2_MSEC(tp->t_rxtcur); } /* Not above a RTO max */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } /* Apply user supplied min TLP */ if (thresh < rack_tlp_min) { thresh = rack_tlp_min; } return (thresh); } static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) { /* * Check to see that we don't need to fall into recovery. We will * need to do so if our oldest transmit is past the time we should * have had an ack. 
*/ struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t idx; uint32_t srtt_cur, srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) return (NULL); if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused < rsm->r_tim_lastsent[idx]) { return (NULL); } if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { return (NULL); } /* Ok if we reach here we are over-due */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); return (rsm); } static uint32_t rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) { int32_t t; int32_t tt; uint32_t ret_val; t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; ret_val = (uint32_t)tt; return (ret_val); } static uint32_t rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. Note that if its NULL we must stop the timer. in all * events we need to stop the running timer (if its running) before * starting the new one. 
*/ uint32_t thresh, exp, to, srtt, time_since_sent; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; struct rack_sendmap *rsm; if (rack->t_timers_stopped) { /* All timers have been stopped none are to run */ return (0); } if (rack->rc_in_persist) { /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } if (tp->t_state < TCPS_ESTABLISHED) goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing on the send map */ activate_rxt: if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); if (to == 0) to = 1; return (to); } return (0); } if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } /* Convert from ms to usecs */ if (rsm->r_flags & RACK_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* * We don't start a rack timer if all we have is a * FIN outstanding. */ goto activate_rxt; } if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < rack->r_ctl.rc_min_to) { to = rack->r_ctl.rc_min_to; } } else { to = rack->r_ctl.rc_min_to; } } else { /* Ok we need to do a TLP not RACK */ if ((rack->rc_tlp_in_progress != 0) || (rack->r_ctl.rc_tlp_rtx_out != 0)) { /* * The previous send was a TLP or a tlp_rtx is in * process. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. 
*/ goto activate_rxt; } if (rsm->r_flags & RACK_HAS_FIN) { /* If its a FIN we dont do TLP */ rsm = NULL; goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) time_since_sent = cts - rsm->r_tim_lastsent[idx]; else time_since_sent = 0; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); if (thresh > time_since_sent) to = thresh - time_since_sent; else to = rack->r_ctl.rc_min_to; if (to > TCPTV_REXMTMAX) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. */ goto activate_rxt; } if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { /* * The tail is no longer the last one I did a probe * on */ rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = rsm->r_start; } } if (is_tlp_timer == 0) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We have exceeded how many times we can retran the * current TLP timer, switch to the RTO timer. 
*/ goto activate_rxt; } else { rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } } if (to == 0) to = 1; return (to); } static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { if (((tp->t_flags & TF_SENTFIN) == 0) && (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) /* Must need to send more data to enter persist */ return; rack->r_ctl.rc_went_idle_time = cts; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; rack->rc_in_persist = 1; } } static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) { if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_flags &= ~TF_FORCEDATA; tp->t_rxtshift = 0; } static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) { struct inpcb *inp; uint32_t delayed_ack = 0; uint32_t hpts_timeout; uint8_t stopped; uint32_t left = 0; inp = tp->t_inpcb; if (inp->inp_in_hpts) { /* A previous call is already set up */ return; } if (tp->t_state == TCPS_CLOSED) { return; } stopped = rack->rc_tmr_stopped; if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } rack->r_ctl.rc_timer_exp = 0; if (rack->rc_inp->inp_in_hpts == 0) { rack->r_ctl.rc_hpts_flags = 0; } if (slot) { /* We are hptsi too */ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { /* * We are still left on the hpts when the to goes * it will be for output. */ if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) slot = cts - rack->r_ctl.rc_last_output_to; else slot = 1; } if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* No send window.. 
we must enter persist */ rack_enter_persist(tp, rack, cts); } else if ((frm_out_sbavail && (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && (tp->snd_wnd < tp->t_maxseg)) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If we have no window or we can't send a segment (and have * data to send.. we cheat here and frm_out_sbavail is * passed in with the sbavail(sb) only from bbr_output) and * we are established, then we must enter persits (if not * already in persits). */ rack_enter_persist(tp, rack, cts); } hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { delayed_ack = tcp_delacktime; rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) hpts_timeout = delayed_ack; else rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; /* * If no timers are going to run and we will fall off the hptsi * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && (slot == 0)) { if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* Get the established keep-alive time */ hpts_timeout = TP_KEEPIDLE(tp); } else { /* Get the initial setup keep-alive time */ hpts_timeout = TP_KEEPINIT(tp); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive, delayed_ack we keep track of what was left * and restart the timer with a smaller value. 
*/ if (left < hpts_timeout) hpts_timeout = left; } if (hpts_timeout) { /* * Hack alert for now we can't time-out over 2,147,483 * seconds (a bit more than 596 hours), which is probably ok * :). */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if (slot) { rack->r_ctl.rc_last_output_to = cts + slot; if ((hpts_timeout == 0) || (hpts_timeout > slot)) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } else { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } } else if (hpts_timeout) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", tp, rack, tot_len_this_send, cts, slot, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; if (slot) rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); } /* * RACK Timer, here we simply do logging and house keeping. * the normal rack_output() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. * The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery. The output * call may or may not do the same thing depending on sysctl * settings. 
*/ struct rack_sendmap *rsm; int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } rack_log_to_event(rack, RACK_TO_FRM_RACK); recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rsm = rack_check_recovery_mode(tp, cts); if (rsm) { uint32_t rtt; rtt = rack->rc_rack_rtt; if (rtt == 0) rtt = 1; if ((recovery == 0) && (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { /* * The rack-timeout that enter's us into recovery * will force out one MSS and set us up so that we * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { /* * When a rack timer goes, we have to send at * least one segment. They will be paced a min of 1ms * apart via the next rack timer (or further * if the rack timer dictates it). */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } } else { /* This is a case that should happen rarely if ever */ counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; return (0); } /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then * send it out. * * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Tail Loss Probe. 
*/ struct rack_sendmap *rsm = NULL; struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); return (1); } /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ rack_log_to_event(rack, RACK_TO_FRM_TLP); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; rack->rc_timer_up = 1; /* * If we are in recovery we can jazz out a segment if new data is * present simply by setting rc_prr_sndcnt to a segment. */ if ((avail > out) && ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { /* New data is available */ amm = avail - out; if (amm > tp->t_maxseg) { amm = tp->t_maxseg; } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill a MTU and no-delay is off */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ old_prr_snd = rack->r_ctl.rc_prr_sndcnt; if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_prr_sndcnt = amm; else goto need_retran; } else { /* Set the send-new override */ if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_tlp_new_data = amm; else goto need_retran; } rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = tp->snd_max; rack->r_ctl.rc_tlpsend = NULL; counter_u64_add(rack_tlp_newdata, 1); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. 
*/ if (rack_always_send_oldest) rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); else { rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { rsm = rack_find_high_nonack(rack, rsm); } } if (rsm == NULL) { counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif goto out; } if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { /* * We need to split this the last segment in two. */ int32_t idx; struct rack_sendmap *nrsm; nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt * off to the RXT timer. */ counter_u64_add(rack_tlp_does_nada, 1); goto out; } nrsm->r_start = (rsm->r_end - tp->t_maxseg); nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } rack->r_ctl.rc_tlpsend = rsm; rack->r_ctl.rc_tlp_rtx_out = 1; if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { rack->r_ctl.rc_tlp_seg_send_cnt++; tp->t_rxtshift++; } else { rack->r_ctl.rc_last_tlp_seq = rsm->r_start; rack->r_ctl.rc_tlp_seg_send_cnt = 1; } send: rack->r_ctl.rc_tlp_send_cnt++; if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { /* * Can't [re]/transmit a segment we have not heard from the * peer in max times. We need the retransmit timer to take * over. 
*/ restore: rack->r_ctl.rc_tlpsend = NULL; if (rsm) rsm->r_flags &= ~RACK_TLP; rack->r_ctl.rc_prr_sndcnt = old_prr_snd; counter_u64_add(rack_tlp_retran_fail, 1); goto out; } else if (rsm) { rsm->r_flags |= RACK_TLP; } if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We don't want to send a single segment more than the max * either. */ goto restore; } rack->r_timer_override = 1; rack->r_tlp_running = 1; rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: rack->rc_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to setup the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack_log_to_event(rack, RACK_TO_FRM_DELACK); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Persists timer, here we simply need to setup the * FORCE-DATA flag the output routine will send * the one byte send. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack->rc_in_persist == 0) return (0); if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. 
*/ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) rack_exit_persist(tp, rack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { retval = 1; TCPSTAT_INC(tcps_persistdrop); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } tp->t_flags |= TF_FORCEDATA; out: rack_log_to_event(rack, RACK_TO_FRM_PERSIST); return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment with respond. */ static int rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; rack_log_to_event(rack, RACK_TO_FRM_KEEP); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. 
*/ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important book keeping. */ static void rack_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct rack_sendmap *rsm, *trsm = NULL; struct tcp_rack *rack; int32_t cnt = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. * However, if we do that we would burst out * all that data 1ms apart. This would be unwise, * so for now we will just let the normal rxt timer * and tlp timer take care of it. 
*/ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_flags & RACK_ACKED) { cnt++; rsm->r_sndcnt = 0; if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; trsm = rsm; } } rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ rack->r_ctl.rc_sacked = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); /* Setup so we send one segment */ if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_timer_override = 1; } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. */ static int rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { int32_t rexmt; struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } rack_remxt_tmr(tp); if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). 
*/ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight. See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, max(MSEC_2_TICKS(rack_rto_min), rexmt), MSEC_2_TICKS(rack_rto_max)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. In that case we won't * catch ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int32_t isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. 
*/ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. 
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * Disable RFC1323 and SACK if we haven't got any response to our * third SYN to work-around some broken terminal servers (most of * which have hopefully been retired) that have bad VJ header * compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; rack_cong_signal(tp, NULL, CC_RTO); out: return (retval); } /* * Run whichever pacer timer is pending for this connection. * * The pending timer bits are taken from rc_hpts_flags (PACE_TMR_MASK). * Returns 0 right away when no timer bit is set. On a listen socket no * timer handler is run; the result is 0 if PACE_PKT_OUTPUT is set and 1 * otherwise. If cts is still before rc_timer_exp the timer has not * expired yet: 0 is returned when PACE_PKT_OUTPUT is set or when * hpts_calling is 0, otherwise the connection is re-inserted into the * hpts for the remaining time and 1 is returned. Once the timer has * expired the timer bits are cleared and exactly one handler runs, * tested in the order delayed-ack, rack, TLP, retransmit, persist, * keep-alive; that handler's return value is passed back. */ static int rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { uint32_t left; /* In this early-expiry branch ret is only a reason code handed to the log; the value returned is the literal 0 or 1. */ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; rack_log_to_processing(rack, cts, ret, 0); return (0); } if (hpts_calling == 0) { ret = -2; rack_log_to_processing(rack, cts, ret, 0); return (0); } /* * Ok our timer
went off early and we are not paced; false * alarm, go back to sleep. */ ret = -3; left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); rack->rc_last_pto_set = 0; return (1); } /* The timer really expired: forget the stopped-timer record and clear the pending bits before dispatching. */ rack->rc_tmr_stopped = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); } else if (timers & PACE_TMR_KEEP) { ret = rack_timeout_keepalive(tp, rack, cts); } rack_log_to_processing(rack, cts, ret, timers); return (ret); } /* * Cancel any pending pacer timer for this connection. * * The connection is removed from the hpts when PACE_PKT_OUTPUT is set * and cts has reached rc_last_output_to, or when a timer is pending, * the inpcb is in the hpts and no output is being paced. When timer * bits are set they are saved in rc_tmr_stopped, the cancel is logged * together with the caller supplied line, and the bits are cleared. */ static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) { uint8_t hpts_removed = 0; if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_inp->inp_in_hpts && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { /* * Canceling timers when we have no output being * paced. We also must remove ourselves from the * hpts.
*/ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } rack_log_to_cancel(rack, hpts_removed, line); rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } } /* Timer-stop hook: intentionally does nothing. */ static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) { return; } /* Stop-all hook: only records t_timers_stopped on the rack control block and returns 0. */ static int rack_stopall(struct tcpcb *tp) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->t_timers_stopped = 1; return (0); } /* Timer-activate hook: intentionally does nothing. */ static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } /* Timer-active query hook: always reports 0 (not active). */ static int rack_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } /* * Suspend the persist, retransmit, keep-alive and delayed-ack timers * of tp. If the persist timer is active when called, rc_in_persist is * set on the rack control block first. */ static void rack_stop_all_timers(struct tcpcb *tp) { struct tcp_rack *rack; /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ rack = (struct tcp_rack *)tp->t_fb_ptr; rack->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } /* * Account for one more transmission of the whole of rsm at time ts. * * Bumps r_rtr_cnt (capped at RACK_NUM_OF_RETRANS, setting * RACK_OVERMAX when the cap is hit) and r_sndcnt, stores ts in the * r_tim_lastsent slot for this transmission, and adds the entry's * length to rc_holes_rxt and r_rtr_bytes when this is not the first * transmission and no TLP is running. An entry that was marked * RACK_ACKED loses the mark and its length is taken back out of * rc_sacked. The entry is moved to the tail of rc_tmap, * RACK_SACK_PASSED is converted to RACK_WAS_SACKPASS, and rc_next is * pointed at the entry following rsm in rc_map. */ static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts) { int32_t idx; rsm->r_rtr_cnt++; rsm->r_sndcnt++; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; if (rsm->r_flags & RACK_ACKED) { /* Probably MTU discovery messing with us */ rsm->r_flags &= ~RACK_ACKED; rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } /* Re-queue at the tail of the transmit-ordered list. */ if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & RACK_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; } /* Update memory for next rtr */
rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); } /* * Record a (re-)transmission of *lenp bytes that starts at * rsm->r_start. * * When the send covers rsm exactly, rsm is updated, *lenp is set to 0 * and 0 is returned. When it runs past rsm, rsm is updated, *lenp is * set to the number of bytes left over and rsm->r_end (the sequence * where the remainder starts) is returned. When it covers only the * front of rsm, rsm is split: rsm keeps the sent part and is updated, * a new entry holding the unsent tail is linked in after it, *lenp is * set to 0 and 0 is returned. If the new entry cannot be allocated * nothing is updated, *lenp is set to 0 and 0 is returned. */ static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end). */ struct rack_sendmap *nrsm; uint32_t c_end; int32_t len; int32_t idx; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ rack_update_rsm(tp, rack, rsm, ts); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end return what's left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * We can't get memory, so lets not proceed. */ *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11.
*/ nrsm->r_start = c_end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = c_end; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rack_update_rsm(tp, rack, rsm, ts); *lenp = 0; return (0); } /* * Record an outgoing segment (len bytes at seq_out, sent at time ts) * in the RACK send map. * * Nothing is recorded when err is non-zero, when the segment carries * RST, when it lies entirely at or below snd_una, or when its length * (after counting SYN/FIN) is zero. A segment that starts below * snd_una is trimmed to start at snd_una. A send that starts at * snd_max gets a freshly allocated entry appended to both rc_map and * rc_tmap. Anything else is treated as a retransmission: the matching * entry is looked up through hintrsm, then rc_next, then a walk of * rc_map, splitting entries where the send starts or ends inside one, * and each covered entry is updated through rack_update_entry(). */ static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; int32_t idx; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a retran. * * Notes about r_start and r_end. Lets consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ /* * If err is set what do we do XXXrrs? should we not add the thing? * -- i.e. return if err != 0 or should we pretend we sent it? -- * i.e. proceed with add ** do this for now. */ /* NOTE(review): the code below returns when err is set, i.e. it does not proceed with the add as the note above suggests. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) /* * We don't log errors -- we could but snd_max does not * advance in this case either.
*/ return; if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; snd_una = tp->snd_una; if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? */ /* Trim the part below snd_una and log only the rest. */ uint32_t end; end = seq_out + len; seq_out = snd_una; len = end - seq_out; } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { /* * The call to rack_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if (th_flags & TH_SYN) len++; if (th_flags & TH_FIN) len++; if (SEQ_LT(snd_max, tp->snd_nxt)) { /* * The add/update has not been done for the FIN/SYN * yet. */ snd_max = tp->snd_nxt; } } if (len == 0) { /* We don't log zero window probes */ return; } rack->r_ctl.rc_time_last_sent = ts; if (IN_RECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } /* First question is it a retransmission? */ if (seq_out == snd_max) { again: rsm = rack_alloc(rack); if (rsm == NULL) { /* * Hmm out of memory and the tcb got destroyed while * we tried to wait. */ #ifdef INVARIANTS panic("Out of memory when we should not be rack:%p", rack); #endif return; } if (th_flags & TH_FIN) { rsm->r_flags = RACK_HAS_FIN; } else { rsm->r_flags = 0; } rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ rsm->r_start = seq_out + 1; rsm->r_end = rsm->r_start + (len - 1); } else { /* Normal case */ rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; } rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; return; } /* * If we reach here its a retransmission and we need to find it.
*/ more: if (hintrsm && (hintrsm->r_start == seq_out)) { /* The caller's hint is consumed once; later passes fall back to rc_next. */ rsm = hintrsm; hintrsm = NULL; } else if (rack->r_ctl.rc_next) { /* We have a hint from a previous run */ rsm = rack->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { /* * We used rc_next or hintrsm to retransmit, hopefully the * likely case. */ seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; } else { goto more; } } /* Ok it was not the last pointer go through it the hard way. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_start == seq_out) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { continue; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = rack_alloc(rack); if (nrsm == NULL) { #ifdef INVARIANTS panic("Ran out of memory that was preallocated? rack:%p", rack); #endif rack_update_rsm(tp, rack, rsm, ts); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ nrsm->r_start = seq_out; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); if (len == 0) { return; } } } /* * Hmm not found in map did they retransmit both old and on into the * new?
*/ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", rack, tp); #endif } else { #ifdef INVARIANTS /* * Hmm beyond sndmax? (only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } /* * Record one of the RTT updates from an ack into * our sample structure. * * Tracks the lowest and highest rtt seen (either is overwritten * unconditionally while the sample is still flagged RACK_RTT_EMPTY), * adds rtt to the running total, bumps the sample count and sets * rs_flags to RACK_RTT_VALID. */ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) { if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; } if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { rack->r_ctl.rack_rs.rs_rtt_highest = rtt; } rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; rack->r_ctl.rack_rs.rs_rtt_tot += rtt; rack->r_ctl.rack_rs.rs_rtt_cnt++; } /* * Collect new round-trip time estimate * and update averages and current timeout.
*/ static void tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) { /* * Does nothing while the sample structure is still flagged * RACK_RTT_EMPTY. Otherwise one rtt is picked from the sample * (lowest, highest or average, per rc_rate_sample_method; an * unknown method returns without updating), a zero rtt is raised * to 1, and t_srtt, t_rttvar, t_rttbest and t_rxtcur are updated * from it. t_rxtshift and t_softerror are reset on the way out. */ int32_t delta; uint32_t o_srtt, o_var; int32_t rtt; if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) /* No valid sample */ return; if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { /* We are to use the lowest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { /* We are to use the highest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_highest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { /* We are to use the average RTT seen in a single ack */ rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); } else { #ifdef INVARIANTS panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); #endif return; } if (rtt == 0) rtt = 1; rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; /* NOTE(review): rack was already dereferenced above, so this reassignment looks redundant -- presumably tp->t_fb_ptr is the same rack the caller passed in; confirm. */ rack = (struct tcp_rack *)tp->t_fb_ptr; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta.
*/ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } TCPSTAT_INC(tcps_rttupdated); rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); tp->t_rttupdated++; #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); #endif tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); tp->t_softerror = 0; } /* * Handle an ACK that matched an earlier transmission of rsm rather * than the latest one. * * Entries carrying RACK_HAS_FIN or RACK_TLP are ignored. Otherwise, if * the connection is in recovery and that recovery was started for this * rsm (rc_rsm_start == rsm->r_start), recovery is exited, snd_recover * is set to snd_una, and cwnd/ssthresh are raised back to the values * saved in rc_cwnd_at/rc_ssthresh_at when those are larger. If the * retransmission was SACK driven (RACK_WAS_SACKPASS) reordering is * counted and rc_reorder_ts is set to cts. The rack_badfr and * rack_badfr_bytes counters are always bumped. The t argument is not * used. */ static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. */ struct tcp_rack *rack; if (rsm->r_flags & RACK_HAS_FIN) { /* * The sending of the FIN often is multiple sent when we * have everything outstanding ack'd. We ignore this case * since its over now. */ return; } if (rsm->r_flags & RACK_TLP) { /* * We expect TLP's to have this occur. */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* should we undo cc changes and exit recovery?
*/ if (IN_RECOVERY(tp->t_flags)) { if (rack->r_ctl.rc_rsm_start == rsm->r_start) { /* * Undo what we ratcheted down and exit recovery if * possible */ EXIT_RECOVERY(tp->t_flags); tp->snd_recover = tp->snd_una; if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; } } if (rsm->r_flags & RACK_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occurring. */ counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } counter_u64_add(rack_badfr, 1); counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); } /* * Work out what an ACK (cumulative or SACK, per ack_type) covering rsm * tells us about round-trip time. * * Returns 0 if rsm is already marked RACK_ACKED. If rsm was sent only * once, or a cumulative ACK echoes the timestamp of its last * transmission, the elapsed time since that transmission is fed to * tcp_rack_xmit_timer() and 1 is returned. Otherwise t_rxtshift and * t_softerror are cleared and the function tries to tell which * transmission was acknowledged: by matching the echoed timestamp * against every recorded send time (cumulative ACKs with timestamps * only), or else by comparing the time since the last send with * rc_rack_min_rtt. Acks attributed to an earlier transmission are * passed to rack_earlier_retran(). Returns 1 when the ack was matched * to a transmission and the rack time state was considered for update, * 0 otherwise. */ static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) { int32_t i; uint32_t t; if (rsm->r_flags & RACK_ACKED) /* Already done */ return (0); if ((rsm->r_rtr_cnt == 1) || ((ack_type == CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr) && (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) ) { /* * We will only find a matching timestamp if its cum-acked. * But if its only one retransmission its for-sure matching * :-) */ t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { rack->r_ctl.rc_rsm_start = tp->snd_max; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure * we send one packet.
*/ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else rack->r_ctl.rc_tlp_rtx_out = 0; } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } /* * We clear the soft/rxtshift since we got an ack. * There is no assurance we will call the commit() function * so we need to clear these to avoid incorrect handling. */ tp->t_rxtshift = 0; tp->t_softerror = 0; if ((to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { /* * Now which timestamp does it match? In this block the ACK * must be coming from a previous transmission. */ for (i = 0; i < rsm->r_rtr_cnt; i++) { if (rsm->r_tim_lastsent[i] == to->to_tsecr) { t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ rack_earlier_retran(tp, rsm, t, cts); } if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } /* * Note the following calls to * tcp_rack_xmit_timer() are being commented * out for now. They give us no more accuracy * and often lead to a wrong choice. We have * enough samples that have not been * retransmitted. I leave the commented out * code in here in case in the future we * decide to add it back (though I can't foresee * doing that). That way we will easily see * where they need to be placed. */ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } } goto ts_not_found; } else { /* * Ok its a SACK block that we retransmitted.
or a windows * machine without timestamps. We can tell nothing from the * time-stamp since its not there or the time the peer last * received a segment that moved forward its cum-ack point. */ ts_not_found: /* r_rtr_cnt is at least 2 here: the single-transmission case returned above. */ i = rsm->r_rtr_cnt - 1; t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed. We most * likely did an improper retransmit as outlined in * 4.2 Step 3 point 2 in the rack-draft. */ i = rsm->r_rtr_cnt - 2; t = cts - rsm->r_tim_lastsent[i]; rack_earlier_retran(tp, rsm, t, cts); } else if (rack->r_ctl.rc_rack_min_rtt) { /* * We retransmitted it and the retransmit did the * job. */ if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; rack->rc_rack_rtt = t; } return (1); } } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. * * Walks rc_tmap backwards from rsm. Entries already RACK_ACKED are * skipped. An entry whose last send time differs from rsm's is always * marked; one with the same send time is marked only if its r_start is * before rsm's. Marking sets RACK_SACK_PASSED and clears * RACK_WAS_SACKPASS. */ static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *nrsm; uint32_t ts; int32_t idx; idx = rsm->r_rtr_cnt - 1; ts = rsm->r_tim_lastsent[idx]; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { if (nrsm == rsm) { /* Skip original segment he is acked */ continue; } if (nrsm->r_flags & RACK_ACKED) { /* Skip ack'd segments */ continue; } idx = nrsm->r_rtr_cnt - 1; if (ts == nrsm->r_tim_lastsent[idx]) { /* * For this case lets use seq no, if we sent in a * big block (TSO) we would have a bunch of segments * sent at the same time. * * We would only get a report if its SEQ is earlier.
* If we have done multiple retransmits the times * would not be equal. */ if (SEQ_LT(nrsm->r_start, rsm->r_start)) { nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } else { /* * Here they were sent at different times, not a big * block. Since we transmitted this one later and * see it sack'd then this must also be missing (or * we would have gotten a sack block for it) */ nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } } /* * Apply one SACK block to the send map. * * *prsm is the entry to start searching from (NULL means search from * the head of rc_map); on return it holds the entry the walk ended on * (possibly NULL) and rc_sacklast is set to the entry after it. The * entry containing sack->start is located, walking backwards first if * the hint lies beyond the block start and restarting once from the * head if nothing is found. Entries are split where the block starts * or ends inside one, and every covered entry not yet RACK_ACKED gets * its RTT processed, is added to rc_sacked, is marked RACK_ACKED (with * RACK_TLP cleared) and is taken off rc_tmap. Processing stops early * if a split cannot be allocated. Returns the number of bytes newly * marked as sacked. */ static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) { int32_t idx; int32_t times = 0; uint32_t start, end, changed = 0; struct rack_sendmap *rsm, *nrsm; int32_t used_ref = 1; start = sack->start; end = sack->end; rsm = *prsm; if (rsm && SEQ_LT(start, rsm->r_start)) { TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { goto do_rest_ofb; } } } if (rsm == NULL) { start_at_beginning: rsm = NULL; used_ref = 0; } /* First lets locate the block where this guy is */ TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { break; } } do_rest_ofb: if (rsm == NULL) { /* * This happens when we get duplicate sack blocks with the * same end. For example SACK 4: 100 SACK 3: 100 The sort * will not change their location so we would just start at * the end of the first one and get lost. */ if (tp->t_flags & TF_SENTFIN) { /* * Check to see if we have not logged the FIN that * went out. */ nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { /* * Ok we did not get the FIN logged.
*/ nrsm->r_end++; rsm = nrsm; goto do_rest_ofb; } } /* Only one restart from the head of the map is attempted per lookup. */ if (times == 1) { #ifdef INVARIANTS panic("tp:%p rack:%p sack:%p to:%p prsm:%p", tp, rack, sack, to, prsm); #else goto out; #endif } times++; counter_u64_add(rack_sack_proc_restart, 1); goto start_at_beginning; } /* Ok we have an ACK for some piece of rsm */ if (rsm->r_start != start) { /* * Need to split this in two pieces the before and after. */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * failed XXXrrs what can we do but lose the sack * info? */ goto out; } nrsm->r_start = start; nrsm->r_rtr_bytes = 0; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { /* * The end of this block is either beyond this guy or right * at this guy. */ if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occurring?
*/ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } if (end == rsm->r_end) { /* This block only - done */ goto out; } /* There is more not covered by this rsm move on */ start = rsm->r_end; nrsm = TAILQ_NEXT(rsm, r_next); rsm = nrsm; times = 0; goto do_rest_ofb; } /* Ok we need to split off this one at the tail */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* failed rrs what can we do but lose the sack info? */ goto out; } /* Clone it */ nrsm->r_start = end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_bytes = 0; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } /* The sack block does not cover this guy fully */ rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_end = end; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } if (rsm->r_flags & RACK_ACKED) { /* Been here done that */ goto out; } rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occurring?
*/ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } out: /* used_ref stays 1 only if the search never had to start from the head of the map. */ if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { counter_u64_add(rack_sack_proc_short, 1); } /* Save off where we last were */ if (rsm) rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); else rack->r_ctl.rc_sacklast = NULL; *prsm = rsm; return (changed); } /* * Undo SACK marks after the peer reneged. * * Starting at rsm and following rc_map order for as long as entries * are marked RACK_ACKED: take the entry's length back out of * rc_sacked, clear RACK_ACKED, RACK_SACK_PASSED and RACK_WAS_SACKPASS, * and link it back into rc_tmap (the first one at the head, each later * one right after the previous). Finally, when rack_use_sack_filter is * set, the sack filter is cleared at th_ack. */ static void inline rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) { struct rack_sendmap *tmap; tmap = NULL; while (rsm && (rsm->r_flags & RACK_ACKED)) { /* Its no longer sacked, mark it so */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef INVARIANTS if (rsm->r_in_tmap) { panic("rack:%p rsm:%p flags:0x%x in tmap?", rack, rsm, rsm->r_flags); } #endif rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; rsm = TAILQ_NEXT(rsm, r_next); } /* * Now lets possibly clear the sack filter so we start * recognizing sacks that cover this area.
*/
	if (rack_use_sack_filter)
		sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
}

/*
 * Apply the cumulative ACK and any SACK blocks carried by th/to to the
 * rack send map.
 *
 * Segments with TH_RST set are ignored.  When th_ack is beyond the start
 * of the first map entry, fully covered entries are unlinked from rc_map
 * (and from rc_tmap when they are on it) and freed, and a partially
 * covered entry has its r_start moved up to th_ack.  SACK blocks that
 * pass the range checks are collected, optionally run through the sack
 * filter, sorted by their end sequence, de-duplicated and handed to
 * rack_proc_sack_blk().  At the end the function may signal entry into
 * recovery and, while already in recovery, recomputes rc_prr_sndcnt.
 *
 * The inpcb write lock must be held (asserted below).
 */
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
{
	uint32_t changed, last_seq, entered_recovery = 0;
	struct tcp_rack *rack;
	struct rack_sendmap *rsm;
	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
	register uint32_t th_ack;
	int32_t i, j, k, num_sack_blks = 0;
	uint32_t cts, acked, ack_point, sack_changed = 0;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	if (th->th_flags & TH_RST) {
		/* We don't log resets */
		return;
	}
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	cts = tcp_ts_getticks();
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
	changed = 0;
	th_ack = th->th_ack;

	if (SEQ_GT(th_ack, tp->snd_una)) {
		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
		tp->t_acktime = ticks;
	}
	/* 'changed' starts out as the number of bytes newly cum-acked. */
	if (rsm && SEQ_GT(th_ack, rsm->r_start))
		changed = th_ack - rsm->r_start;
	if (changed) {
		/*
		 * The ACK point is advancing to th_ack, we must drop off
		 * the packets in the rack log and calculate any eligible
		 * RTT's.
		 */
		rack->r_wanted_output++;
more:
		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
		if (rsm == NULL) {
			if ((th_ack - 1) == tp->iss) {
				/*
				 * For the SYN incoming case we will not
				 * have called tcp_output for the sending of
				 * the SYN, so there will be no map. All
				 * other cases should probably be a panic.
				 */
				goto proc_sack;
			}
			if (tp->t_flags & TF_SENTFIN) {
				/* if we send a FIN we will not have a map */
				goto proc_sack;
			}
#ifdef INVARIANTS
			panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
			    tp, th, tp->t_state, rack, tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
#endif
			goto proc_sack;
		}
		if (SEQ_LT(th_ack, rsm->r_start)) {
			/* Huh map is missing this */
#ifdef INVARIANTS
			printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
			    rsm->r_start, th_ack, tp->t_state, rack->r_state);
#endif
			goto proc_sack;
		}
		rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
		/* Now do we consume the whole thing? */
		if (SEQ_GEQ(th_ack, rsm->r_end)) {
			/* It's all consumed. */
			uint32_t left;

			rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
			rsm->r_rtr_bytes = 0;
			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
			if (rsm->r_in_tmap) {
				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
				rsm->r_in_tmap = 0;
			}
			if (rack->r_ctl.rc_next == rsm) {
				/* scoot along the marker */
				rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
			}
			if (rsm->r_flags & RACK_ACKED) {
				/*
				 * It was acked on the scoreboard -- remove
				 * it from total
				 */
				rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
			} else if (rsm->r_flags & RACK_SACK_PASSED) {
				/*
				 * There are acked segments ACKED on the
				 * scoreboard further up. We are seeing
				 * reordering.
				 */
				counter_u64_add(rack_reorder_seen, 1);
				rsm->r_flags |= RACK_ACKED;
				rack->r_ctl.rc_reorder_ts = cts;
			}
			/* Non-zero when th_ack reaches past this entry. */
			left = th_ack - rsm->r_end;
			if (rsm->r_rtr_cnt > 1) {
				/*
				 * Technically we should make r_rtr_cnt be
				 * monotonically increasing and just mod it to
				 * the timestamp it is replacing.. that way
				 * we would have the last 3 retransmits. Now
				 * rc_loss_count will be wrong if we
				 * retransmit something more than 2 times in
				 * recovery :(
				 */
				rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
			}
			/* Free back to zone */
			rack_free(rack, rsm);
			if (left) {
				goto more;
			}
			goto proc_sack;
		}
		if (rsm->r_flags & RACK_ACKED) {
			/*
			 * It was acked on the scoreboard -- remove it from
			 * total for the part being cum-acked.
			 */
			rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
		}
		/* Partially acked entry: trim its front up to th_ack. */
		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
		rsm->r_rtr_bytes = 0;
		rsm->r_start = th_ack;
	}
proc_sack:
	/* Check for reneging */
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
	if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
		/*
		 * The peer has moved snd_una up to
		 * the edge of this send, i.e. one
		 * that it had previously acked. The only
		 * way that can be true is if the peer threw
		 * away data (space issues) that it had
		 * previously sacked (else it would have
		 * given us snd_una up to rsm->r_end).
		 * We need to undo the acked markings here.
		 *
		 * Note we have to look to make sure th_ack is
		 * our rsm->r_start in case we get an old ack
		 * where th_ack is behind snd_una.
		 */
		rack_peer_reneges(rack, rsm, th->th_ack);
	}
	if ((to->to_flags & TOF_SACK) == 0) {
		/* We are done nothing left to log */
		goto out;
	}
	rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
	if (rsm) {
		last_seq = rsm->r_end;
	} else {
		last_seq = tp->snd_max;
	}
	/* Sack block processing */
	if (SEQ_GT(th_ack, tp->snd_una))
		ack_point = th_ack;
	else
		ack_point = tp->snd_una;
	for (i = 0; i < to->to_nsacks; i++) {
		/* Copy out the i-th block and convert it to host order. */
		bcopy((to->to_sacks + i * TCPOLEN_SACK),
		    &sack, sizeof(sack));
		sack.start = ntohl(sack.start);
		sack.end = ntohl(sack.end);
		/* Keep only blocks lying within (ack_point, snd_max]. */
		if (SEQ_GT(sack.end, sack.start) &&
		    SEQ_GT(sack.start, ack_point) &&
		    SEQ_LT(sack.start, tp->snd_max) &&
		    SEQ_GT(sack.end, ack_point) &&
		    SEQ_LEQ(sack.end, tp->snd_max)) {
			if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
			    (SEQ_LT(sack.end, last_seq)) &&
			    ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
				/*
				 * Not the last piece and its smaller than
				 * 1/8th of a MSS. We ignore this.
				 */
				counter_u64_add(rack_runt_sacks, 1);
				continue;
			}
			sack_blocks[num_sack_blks] = sack;
			num_sack_blks++;
#ifdef NETFLIX_STATS
		} else if (SEQ_LEQ(sack.start, th_ack) &&
		    SEQ_LEQ(sack.end, th_ack)) {
			/*
			 * Its a D-SACK block.
			 */
			tcp_record_dsack(sack.start, sack.end);
#endif
		}
	}
	if (num_sack_blks == 0)
		goto out;
	/*
	 * Sort the SACK blocks so we can update the rack scoreboard with
	 * just one pass.
	 */
	if (rack_use_sack_filter) {
		num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
	}
	if (num_sack_blks < 2) {
		goto do_sack_work;
	}
	/* Sort the sacks (ascending by end sequence) */
	for (i = 0; i < num_sack_blks; i++) {
		for (j = i + 1; j < num_sack_blks; j++) {
			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
				sack = sack_blocks[i];
				sack_blocks[i] = sack_blocks[j];
				sack_blocks[j] = sack;
			}
		}
	}
	/*
	 * Now are any of the sack block ends the same (yes some
	 * implementations send these)?
	 */
again:
	if (num_sack_blks > 1) {
		for (i = 0; i < num_sack_blks; i++) {
			for (j = i + 1; j < num_sack_blks; j++) {
				if (sack_blocks[i].end == sack_blocks[j].end) {
					/*
					 * Ok these two have the same end we
					 * want the smallest end and then
					 * throw away the larger and start
					 * again.
					 */
					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
						/*
						 * The second block covers
						 * more area use that
						 */
						sack_blocks[i].start = sack_blocks[j].start;
					}
					/*
					 * Now collapse out the dup-sack and
					 * lower the count
					 */
					for (k = (j + 1); k < num_sack_blks; k++) {
						sack_blocks[j].start = sack_blocks[k].start;
						sack_blocks[j].end = sack_blocks[k].end;
						j++;
					}
					num_sack_blks--;
					goto again;
				}
			}
		}
	}
do_sack_work:
	rsm = rack->r_ctl.rc_sacklast;
	for (i = 0; i < num_sack_blks; i++) {
		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
		if (acked) {
			rack->r_wanted_output++;
			changed += acked;
			sack_changed += acked;
		}
	}
out:
	if (changed) {
		/* Something changed cancel the rack timer */
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
	}
	if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
		/*
		 * Ok we have a high probability that we need to go in to
		 * recovery since we have data sack'd
		 */
		/* NOTE(review): this 'rsm' shadows the function-scope one. */
		struct rack_sendmap *rsm;
		uint32_t tsused;

		tsused = tcp_ts_getticks();
		rsm = tcp_rack_output(tp, rack, tsused);
		if (rsm) {
			/* Enter recovery */
			rack->r_ctl.rc_rsm_start = rsm->r_start;
			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
			entered_recovery = 1;
			rack_cong_signal(tp, NULL, CC_NDUPACK);
			/*
			 * When we enter recovery we need to assure we send
			 * one packet.
			 */
			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
			rack->r_timer_override = 1;
		}
	}
	if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
		/* Deal with changed and PRR here (in recovery only) */
		uint32_t pipe, snd_una;

		rack->r_ctl.rc_prr_delivered += changed;
		/* Compute prr_sndcnt */
		if (SEQ_GT(tp->snd_una, th_ack)) {
			snd_una = tp->snd_una;
		} else {
			snd_una = th_ack;
		}
		/* Outstanding bytes, less what is sacked, plus retransmitted holes. */
		pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
		if (pipe > tp->snd_ssthresh) {
			long sndcnt;

			sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
			if (rack->r_ctl.rc_prr_recovery_fs > 0)
				sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
			else {
				rack->r_ctl.rc_prr_sndcnt = 0;
				sndcnt = 0;
			}
			sndcnt++;
			if (sndcnt > (long)rack->r_ctl.rc_prr_out)
				sndcnt -= rack->r_ctl.rc_prr_out;
			else
				sndcnt = 0;
			rack->r_ctl.rc_prr_sndcnt = sndcnt;
		} else {
			uint32_t limit;

			if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
				limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
			else
				limit = 0;
			if (changed > limit)
				limit = changed;
			limit += tp->t_maxseg;
			if (tp->snd_ssthresh > pipe) {
				rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
			} else {
				/* Reached only when pipe == snd_ssthresh. */
				rack->r_ctl.rc_prr_sndcnt = min(0, limit);
			}
		}
		if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
			rack->r_timer_override = 1;
		}
	}
}

/*
 * ACK processing for a segment whose ACK flag has been validated by the
 * caller.
 *
 * Return value of 1, we do not need to call rack_process_data().
 * return value of 0, rack_process_data can be called.
 * For ret_val if its 0 the TCP is locked, if its non-zero
 * its unlocked and probably unsafe to touch the TCB.
 *
 * If ofia is non-NULL it is set, on the return (0) paths taken after the
 * old/duplicate-ACK check, to whether this ACK covered our FIN.
 */
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to,
-    int32_t * ti_locked, uint32_t tiwin, int32_t tlen,
+    uint32_t tiwin, int32_t tlen,
    int32_t * ofia, int32_t thflags, int32_t * ret_val)
{
	int32_t ourfinisacked = 0;
	int32_t nsegs, acked_amount;
	int32_t acked;
	struct mbuf *mfree;
	struct tcp_rack *rack;
	int32_t recovery = 0;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (SEQ_GT(th->th_ack, tp->snd_max)) {
		/* ACK for data we have not sent. */
-		rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
+		rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
		return (1);
	}
	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
		rack_log_ack(tp, to, th);
	}
	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
		/*
		 * Old ack, behind (or duplicate to) the last one rcv'd
		 * Note: Should mark reordering is occurring! We should also
		 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
		 * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 ack 3
		 */
		return (0);
	}
	/*
	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
	 * something we sent.
	 */
	if (tp->t_flags & TF_NEEDSYN) {
		/*
		 * T/TCP: Connection was half-synchronized, and our SYN has
		 * been ACK'd (so connection is now fully synchronized).  Go
		 * to non-starred state, increment snd_una for ACK of SYN,
		 * and check if we can do window scaling.
		 */
		tp->t_flags &= ~TF_NEEDSYN;
		tp->snd_una++;
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
			tp->rcv_scale = tp->request_r_scale;
			/* Send window already scaled. */
		}
	}
	nsegs = max(1, m->m_pkthdr.lro_nsegs);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	acked = BYTES_THIS_ACK(tp, th);
	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
	TCPSTAT_ADD(tcps_rcvackbyte, acked);

	/*
	 * If we just performed our first retransmit, and the ACK arrives
	 * within our recovery window, then it was a mistake to do the
	 * retransmit in the first place.  Recover our original cwnd and
	 * ssthresh, and proceed to transmit where we left off.
	 */
	if (tp->t_flags & TF_PREVVALID) {
		tp->t_flags &= ~TF_PREVVALID;
		if (tp->t_rxtshift == 1 &&
		    (int)(ticks - tp->t_badrxtwin) < 0)
			rack_cong_signal(tp, th, CC_RTO_ERR);
	}
	/*
	 * If we have a timestamp reply, update smoothed round trip time. If
	 * no timestamp is present but transmit timer is running and timed
	 * sequence number was acked, update smoothed round trip time. Since
	 * we now have an rtt measurement, cancel the timer backoff (cf.,
	 * Phil Karn's retransmit alg.). Recompute the initial retransmit
	 * timer.
	 *
	 * Some boxes send broken timestamp replies during the SYN+ACK
	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
	 * and blow up the retransmit timer.
	 */
	/*
	 * If all outstanding data is acked, stop retransmit timer and
	 * remember to restart (more output or persist). If there is more
	 * data to be acked, restart retransmit timer, using current
	 * (possibly backed-off) value.
	 */
	if (th->th_ack == tp->snd_max) {
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
		rack->r_wanted_output++;
	}
	/*
	 * If no data (only SYN) was ACK'd, skip rest of ACK processing.
	 */
	if (acked == 0) {
		if (ofia)
			*ofia = ourfinisacked;
		return (0);
	}
	/*
	 * With rc_early_recovery set, partial-ack / post-recovery handling
	 * runs before the congestion control update; otherwise it runs
	 * after the send buffer has been trimmed (below).
	 */
	if (rack->r_ctl.rc_early_recovery) {
		if (IN_FASTRECOVERY(tp->t_flags)) {
			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
				tcp_rack_partialack(tp, th);
			} else {
				rack_post_recovery(tp, th);
				recovery = 1;
			}
		}
	}
	/*
	 * Let the congestion control algorithm update congestion control
	 * related information. This typically means increasing the
	 * congestion window.
	 */
	rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
	SOCKBUF_LOCK(&so->so_snd);
	/* Never cut more than the send buffer actually holds. */
	acked_amount = min(acked, (int)sbavail(&so->so_snd));
	tp->snd_wnd -= acked_amount;
	mfree = sbcut_locked(&so->so_snd, acked_amount);
	/*
	 * More was acked than the buffer held while the buffer is now empty
	 * and we are at or beyond FIN_WAIT_1: treat our FIN as acked.
	 */
	if ((sbused(&so->so_snd) == 0) &&
	    (acked > acked_amount) &&
	    (tp->t_state >= TCPS_FIN_WAIT_1)) {
		ourfinisacked = 1;
	}
	/* NB: sowwakeup_locked() does an implicit unlock. */
	sowwakeup_locked(so);
	m_freem(mfree);
	if (rack->r_ctl.rc_early_recovery == 0) {
		if (IN_FASTRECOVERY(tp->t_flags)) {
			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
				tcp_rack_partialack(tp, th);
			} else {
				rack_post_recovery(tp, th);
			}
		}
	}
	tp->snd_una = th->th_ack;
	if (SEQ_GT(tp->snd_una, tp->snd_recover))
		tp->snd_recover = tp->snd_una;

	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
		tp->snd_nxt = tp->snd_una;
	}
	if (tp->snd_una == tp->snd_max) {
		/* Nothing left outstanding */
		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
		tp->t_acktime = 0;
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
		/* Set need output so persist might get set */
		rack->r_wanted_output++;
		if (rack_use_sack_filter)
			sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
		    (sbavail(&so->so_snd) == 0) &&
		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
			/*
			 * The socket was gone and the
			 * peer sent data, time to
			 * reset him.
			 */
			*ret_val = 1;
			tp = tcp_close(tp);
-			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen);
+			rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
			return (1);
		}
	}
	if (ofia)
		*ofia = ourfinisacked;
	return (0);
}

/*
 * Update the send window from the segment, handle urgent data, append
 * in-order payload to the receive buffer (or pass it to tcp_reass()) and
 * process a received FIN.
 *
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
-    int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
{
	/*
	 * Update window information. Don't look at window if no ACK: TAC's
	 * send garbage on first SYN.
	 */
	int32_t nsegs;
	int32_t tfo_syn;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	INP_WLOCK_ASSERT(tp->t_inpcb);

	nsegs = max(1, m->m_pkthdr.lro_nsegs);
	if ((thflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			TCPSTAT_INC(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		rack->r_wanted_output++;
	} else if (thflags & TH_ACK) {
		/* Same ack point but a smaller window: accept the shrink. */
		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
			tp->snd_wnd = tiwin;
			tp->snd_wl1 = th->th_seq;
			tp->snd_wl2 = th->th_ack;
		}
	}
	/* Was persist timer active and now we have window space? */
	if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
		rack_exit_persist(tp, rack);
		tp->snd_nxt = tp->snd_max;
		/* Make sure we output to start the timer */
		rack->r_wanted_output++;
	}
	/*
	 * Process segments with URG.
	 */
	if ((thflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept random
		 * urgent pointers, we'll crash in soreceive.  It's hard to
		 * imagine someone actually wanting to send this much urgent
		 * data.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
			th->th_urp = 0;	/* XXX */
			thflags &= ~TH_URG;	/* XXX */
			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
			goto dodata;	/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer, then
		 * mark the data stream.  This should not happen in
		 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
		 * FIN has been received from the remote side. In these
		 * states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols), the urgent
		 * pointer points to the last octet of urgent data.  We
		 * continue, however, to consider it to indicate the first
		 * octet of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = sbavail(&so->so_rcv) +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SBS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		SOCKBUF_UNLOCK(&so->so_rcv);
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (uint32_t) tlen &&
		    !(so->so_options & SO_OOBINLINE)) {
			/* hdr drop is delayed */
			tcp_pulloutofband(so, th, m, drop_hdrlen);
		}
	} else {
		/*
		 * If no out of band data is expected, pull receive urgent
		 * pointer along with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
	}
dodata:				/* XXX */
	INP_WLOCK_ASSERT(tp->t_inpcb);

	/*
	 * Process the segment text, merging it into the TCP sequencing
	 * queue, and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data is
	 * presented to the user (this happens in tcp_usrreq.c, case
	 * PRU_RCVD).  If a FIN has already been received on this connection
	 * then we just ignore the text.
	 */
	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
	    IS_FASTOPEN(tp->t_flags));
	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq save_start = th->th_seq;

		m_adj(m, drop_hdrlen);	/* delayed header drop */
		/*
		 * Insert segment which includes th into TCP reassembly
		 * queue with control block tp.  Set thflags to whether
		 * reassembly now includes a segment with FIN.  This handles
		 * the common case inline (segment is the next to be
		 * received on an established connection, and the queue is
		 * empty), avoiding linkage into and removal from the queue
		 * and repetition of various conversions. Set DELACK for
		 * segments received in order, but ack immediately when
		 * segments are out of order (so fast retransmit can work).
		 */
		if (th->th_seq == tp->rcv_nxt &&
		    LIST_EMPTY(&tp->t_segq) &&
		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tfo_syn)) {
			if (DELAY_ACK(tp, tlen) || tfo_syn) {
				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
				tp->t_flags |= TF_DELACK;
			} else {
				rack->r_wanted_output++;
				tp->t_flags |= TF_ACKNOW;
			}
			tp->rcv_nxt += tlen;
			thflags = th->th_flags & TH_FIN;
			TCPSTAT_ADD(tcps_rcvpack, nsegs);
			TCPSTAT_ADD(tcps_rcvbyte, tlen);
			SOCKBUF_LOCK(&so->so_rcv);
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
				m_freem(m);
			else
				sbappendstream_locked(&so->so_rcv, m, 0);
			/* NB: sorwakeup_locked() does an implicit unlock. */
			sorwakeup_locked(so);
		} else {
			/*
			 * XXX: Due to the header drop above "th" is
			 * theoretically invalid by now.  Fortunately
			 * m_adj() doesn't actually free any mbufs when
			 * trimming from the head.
			 */
			thflags = tcp_reass(tp, th, &tlen, m);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tlen > 0)
			tcp_update_sack_list(tp, save_start, save_start + tlen);
	} else {
		m_freem(m);
		thflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know that the
	 * connection is closing.
	 */
	if (thflags & TH_FIN) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			/*
			 * If connection is half-synchronized (ie NEEDSYN
			 * flag on) then delay ACK, so it may be piggybacked
			 * when SYN is sent. Otherwise, since we received a
			 * FIN then no more input can be expected, send ACK
			 * now.
			 */
			if (tp->t_flags & TF_NEEDSYN) {
				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
				tp->t_flags |= TF_DELACK;
			} else {
				tp->t_flags |= TF_ACKNOW;
			}
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

			/*
			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
			 * CLOSE_WAIT state.
			 */
		case TCPS_SYN_RECEIVED:
			tp->t_starttime = ticks;
			/* FALLTHROUGH */
		case TCPS_ESTABLISHED:
			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
			tcp_state_change(tp, TCPS_CLOSE_WAIT);
			break;

			/*
			 * If still in FIN_WAIT_1 STATE FIN has not been
			 * acked so enter the CLOSING state.
			 */
		case TCPS_FIN_WAIT_1:
			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
			tcp_state_change(tp, TCPS_CLOSING);
			break;

			/*
			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
			 * starting the time-wait timer, turning off the
			 * other standard timers.
			 */
		case TCPS_FIN_WAIT_2:
			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-			KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata "
-			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
-			    *ti_locked));
			tcp_twstart(tp);
-			*ti_locked = TI_UNLOCKED;
-			INP_INFO_RUNLOCK(&V_tcbinfo);
			return (1);
		}
	}
-	if (*ti_locked == TI_RLOCKED) {
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-		*ti_locked = TI_UNLOCKED;
-	}
	/*
	 * Return any desired output.
	 */
	if ((tp->t_flags & TF_ACKNOW) ||
	    (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
		rack->r_wanted_output++;
	}
-	KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
-	    __func__, *ti_locked));
-	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);
	return (0);
}

/*
 * Here nothing is really faster, its just that we
 * have broken out the fast-data path also just like
 * the fast-ack.
*/
/*
 * Returns 0, leaving the segment untouched, when any of the checks at the
 * top of the function fails, so the caller can fall back to the slow
 * path.  Returns 1 once the payload has been appended to so_rcv (or freed
 * because the socket can no longer receive).
 */
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t nxt_pkt)
{
	int32_t nsegs;
	int32_t newsize = 0;	/* automatic sockbuf scaling */
	struct tcp_rack *rack;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;

#endif
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * the timestamp. NOTE that the test is modified according to the
	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	/* Segment must start exactly at the next expected sequence. */
	if (__predict_false(th->th_seq != tp->rcv_nxt)) {
		return (0);
	}
	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
		return (0);
	}
	/* A non-zero window that differs from snd_wnd needs the slow path. */
	if (tiwin && tiwin != tp->snd_wnd) {
		return (0);
	}
	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
		return (0);
	}
	if (__predict_false((to->to_flags & TOF_TS) &&
	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
		return (0);
	}
	/* The segment must not acknowledge anything new. */
	if (__predict_false((th->th_ack != tp->snd_una))) {
		return (0);
	}
	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
		return (0);
	}
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	/*
	 * This is a pure, in-sequence data packet with nothing on the
	 * reassembly queue and we have enough buffer space to take it.
	 */
-	if (*ti_locked == TI_RLOCKED) {
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-		*ti_locked = TI_UNLOCKED;
-	}
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	/* Clean receiver SACK report if present */
	if (tp->rcv_numsacks)
		tcp_clean_sackreport(tp);
	TCPSTAT_INC(tcps_preddat);
	tp->rcv_nxt += tlen;
	/*
	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
	 */
	tp->snd_wl1 = th->th_seq;
	/*
	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
	 */
	tp->rcv_up = tp->rcv_nxt;
	TCPSTAT_ADD(tcps_rcvpack, nsegs);
	TCPSTAT_ADD(tcps_rcvbyte, tlen);
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp,
		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);

	/* Add data to socket buffer. */
	SOCKBUF_LOCK(&so->so_rcv);
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		m_freem(m);
	} else {
		/*
		 * Set new socket buffer size. Give up when limit is
		 * reached.
		 */
		if (newsize)
			if (!sbreserve_locked(&so->so_rcv,
			    newsize, so, NULL))
				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
		m_adj(m, drop_hdrlen);	/* delayed header drop */
		sbappendstream_locked(&so->so_rcv, m, 0);
		rack_calc_rwin(so, tp);
	}
	/* NB: sorwakeup_locked() does an implicit unlock. */
	sorwakeup_locked(so);
	if (DELAY_ACK(tp, tlen)) {
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
		tp->t_flags |= TF_DELACK;
	} else {
		tp->t_flags |= TF_ACKNOW;
		rack->r_wanted_output++;
	}
	if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
	return (1);
}

/*
 * This subfunction is used to try to highly optimize the
 * fast path. We again allow window updates that are
 * in sequence to remain in the fast-path. We also add
 * in the __predict's to attempt to help the compiler.
 * Note that if we return a 0, then we can *not* process
 * it and the caller should push the packet into the
 * slow-path.
 *
 * A return of 1 means the ACK was fully processed here and the mbuf has
 * been freed.
 */
static int
rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
+    uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
{
	int32_t acked;
	int32_t nsegs;

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;

#endif
	struct tcp_rack *rack;

	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
		/* Old ack, behind (or duplicate to) the last one rcv'd */
		return (0);
	}
	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
		/* Above what we have sent? */
		return (0);
	}
	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
		/* We are retransmitting */
		return (0);
	}
	if (__predict_false(tiwin == 0)) {
		/* zero window */
		return (0);
	}
	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
		/* We need a SYN or a FIN, unlikely.. */
		return (0);
	}
	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
		/* Timestamp is behind .. old ack with seq wrap? */
		return (0);
	}
	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
		/* Still recovering */
		return (0);
	}
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack->r_ctl.rc_sacked) {
		/* We have sack holes on our scoreboard */
		return (0);
	}
	/* Ok if we reach here, we can process a fast-ack */
	nsegs = max(1, m->m_pkthdr.lro_nsegs);
	rack_log_ack(tp, to, th);
	/* Did the window get updated? */
	if (tiwin != tp->snd_wnd) {
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
	/* Leave persist once the window can take at least one segment. */
	if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
		rack_exit_persist(tp, rack);
	}
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * the timestamp. NOTE that the test is modified according to the
	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 */
-	if (*ti_locked == TI_RLOCKED) {
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-		*ti_locked = TI_UNLOCKED;
-	}
	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery.
	 */
	if (tp->t_flags & TF_PREVVALID) {
		tp->t_flags &= ~TF_PREVVALID;
		if (tp->t_rxtshift == 1 &&
		    (int)(ticks - tp->t_badrxtwin) < 0)
			rack_cong_signal(tp, th, CC_RTO_ERR);
	}
	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies during the SYN+ACK
	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
	 * and blow up the retransmit timer.
	 */
	acked = BYTES_THIS_ACK(tp, th);

#ifdef TCP_HHOOK
	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
	hhook_run_tcp_est_in(tp, th, to);
#endif

	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
	TCPSTAT_ADD(tcps_rcvackbyte, acked);
	sbdrop(&so->so_snd, acked);
	/*
	 * Let the congestion control algorithm update congestion control
	 * related information. This typically means increasing the
	 * congestion window.
	 */
	rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);

	tp->snd_una = th->th_ack;
	/*
	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
	 */
	tp->snd_wl2 = th->th_ack;
	tp->t_dupacks = 0;
	m_freem(m);
	/* ND6_HINT(tp); *//* Some progress has been made. */

	/*
	 * If all outstanding data are acked, stop retransmit timer,
	 * otherwise restart timer using current (possibly backed-off)
	 * value. If process is waiting for space, wakeup/selwakeup/signal.
	 * If data are ready to send, let tcp_output decide between more
	 * output or persist.
	 */
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp,
		    (void *)tcp_saveipgen,
		    &tcp_savetcp, 0);
#endif
	if (tp->snd_una == tp->snd_max) {
		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
		tp->t_acktime = 0;
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
	}
	/* Wake up the socket if we have room to write more */
	sowwakeup(so);
	if (sbavail(&so->so_snd)) {
		rack->r_wanted_output++;
	}
	return (1);
}

/*
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
*/
/*
 * Segment processing for a connection in SYN_SENT.  Validates the
 * ACK/RST/SYN flags, records the peer's initial sequence number, moves
 * the connection to ESTABLISHED, FIN_WAIT_1 or SYN_RECEIVED and then
 * continues with ACK and data processing.
 */
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
{
	int32_t ret_val = 0;
	int32_t todrop;
	int32_t ourfinisacked = 0;

	rack_calc_rwin(so, tp);
	/*
	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
	 * SYN, drop the input. if seg contains a RST, then drop the
	 * connection. if seg does not contain SYN, then drop it. Otherwise
	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
	 * tp->irs if seg contains ack then advance tp->snd_una if seg
	 * contains an ECE and ECN support is enabled, the stream is ECN
	 * capable. if SYN has been acked change to ESTABLISHED else
	 * SYN_RCVD state arrange for segment to be acked (eventually)
	 * continue processing rest of data/controls, beginning with URG
	 */
	if ((thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) ||
	    SEQ_GT(th->th_ack, tp->snd_max))) {
-		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
		return (1);
	}
	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
		TCP_PROBE5(connect__refused, NULL, tp,
		    mtod(m, const char *), tp, th);
		tp = tcp_drop(tp, ECONNREFUSED);
-		rack_do_drop(m, tp, ti_locked);
+		rack_do_drop(m, tp);
		return (1);
	}
	if (thflags & TH_RST) {
-		rack_do_drop(m, tp, ti_locked);
+		rack_do_drop(m, tp);
		return (1);
	}
	if (!(thflags & TH_SYN)) {
-		rack_do_drop(m, tp, ti_locked);
+		rack_do_drop(m, tp);
		return (1);
	}
	tp->irs = th->th_seq;
	tcp_rcvseqinit(tp);
	if (thflags & TH_ACK) {
		int tfo_partial = 0;

		TCPSTAT_INC(tcps_connects);
		soisconnected(so);
#ifdef MAC
		mac_socketpeer_set_from_mbuf(m, so);
#endif
		/* Do window scaling on this connection? */
		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
			tp->rcv_scale = tp->request_r_scale;
		}
		tp->rcv_adv += min(tp->rcv_wnd,
		    TCP_MAXWIN << tp->rcv_scale);
		/*
		 * If not all the data that was sent in the TFO SYN
		 * has been acked, resend the remainder right away.
		 */
		if (IS_FASTOPEN(tp->t_flags) &&
		    (tp->snd_una != tp->snd_max)) {
			tp->snd_nxt = th->th_ack;
			tfo_partial = 1;
		}
		/*
		 * If there's data, delay ACK; if there's also a FIN ACKNOW
		 * will be turned on later.
		 */
		if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
			rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
			    ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
			tp->t_flags |= TF_DELACK;
		} else {
			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
			tp->t_flags |= TF_ACKNOW;
		}

		if ((thflags & TH_ECE) && V_tcp_do_ecn) {
			tp->t_flags |= TF_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			/*
			 * We advance snd_una for the
			 * fast open case. If th_ack is
			 * acknowledging data beyond
			 * snd_una we can't just call
			 * ack-processing since the
			 * data stream in our send-map
			 * will start at snd_una + 1 (one
			 * beyond the SYN). If its just
			 * equal we don't need to do that
			 * and there is no send_map.
			 */
			tp->snd_una++;
		}
		/*
		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
		 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
		 */
		tp->t_starttime = ticks;
		if (tp->t_flags & TF_NEEDFIN) {
			tcp_state_change(tp, TCPS_FIN_WAIT_1);
			tp->t_flags &= ~TF_NEEDFIN;
			thflags &= ~TH_SYN;
		} else {
			tcp_state_change(tp, TCPS_ESTABLISHED);
			TCP_PROBE5(connect__established, NULL, tp,
			    mtod(m, const char *), tp, th);
			cc_conn_init(tp);
		}
	} else {
		/*
		 * Received initial SYN in SYN-SENT[*] state => simultaneous
		 * open.  If segment contains CC option and there is a
		 * cached CC, apply TAO test. If it succeeds, connection is
		 * half-synchronized. Otherwise, do 3-way handshake:
		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
		 * there was no CC option, clear cached CC value.
		 */
		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
		tcp_state_change(tp, TCPS_SYN_RECEIVED);
	}
-	KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
-	    "ti_locked %d", __func__, *ti_locked));
	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);
	/*
	 * Advance th->th_seq to correspond to first data byte. If data,
	 * trim to stay within window, dropping FIN if necessary.
	 */
	th->th_seq++;
	if (tlen > tp->rcv_wnd) {
		todrop = tlen - tp->rcv_wnd;
		/* A negative length trims from the tail of the chain. */
		m_adj(m, -todrop);
		tlen = tp->rcv_wnd;
		thflags &= ~TH_FIN;
		TCPSTAT_INC(tcps_rcvpackafterwin);
		TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
	}
	tp->snd_wl1 = th->th_seq - 1;
	tp->rcv_up = th->th_seq;
	/*
	 * Client side of transaction: already sent SYN and data. If the
	 * remote host used T/TCP to validate the SYN, our data will be
	 * ACK'd; if so, enter normal data segment processing in the middle
	 * of step 5, ack processing. Otherwise, goto step 6.
	 */
	if (thflags & TH_ACK) {
-		if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
+		if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
			return (ret_val);
		/* We may have changed to FIN_WAIT_1 above */
		if (tp->t_state == TCPS_FIN_WAIT_1) {
			/*
			 * In FIN_WAIT_1 STATE in addition to the processing
			 * for the ESTABLISHED state if our FIN is now
			 * acknowledged then enter FIN_WAIT_2.
			 */
			if (ourfinisacked) {
				/*
				 * If we can't receive any more data, then
				 * closing user can proceed. Starting the
				 * timer is contrary to the specification,
				 * but if we don't get a FIN we'll hang
				 * forever.
				 *
				 * XXXjl: we should release the tp also, and
				 * use a compressed state.
				 */
				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
					soisdisconnected(so);
					tcp_timer_activate(tp, TT_2MSL,
					    (tcp_fast_finwait2_recycle ?
					    tcp_finwait2_timeout :
					    TP_MAXIDLE(tp)));
				}
				tcp_state_change(tp, TCPS_FIN_WAIT_2);
			}
		}
	}
	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
-	    ti_locked, tiwin, thflags, nxt_pkt));
+	    tiwin, thflags, nxt_pkt));
}

/*
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
{
	int32_t ret_val = 0;
	int32_t ourfinisacked = 0;

	rack_calc_rwin(so, tp);

	if ((thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
	    SEQ_GT(th->th_ack, tp->snd_max))) {
-		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
		return (1);
	}
	if (IS_FASTOPEN(tp->t_flags)) {
		/*
		 * When a TFO connection is in SYN_RECEIVED, the
		 * only valid packets are the initial SYN, a
		 * retransmit/copy of the initial SYN (possibly with
		 * a subset of the original data), a valid ACK, a
		 * FIN, or a RST.
*/ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. 
*/ if (SEQ_LT(th->th_seq, tp->irs)) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to regular * ACK processing below. */ tp->snd_una++; } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); } /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void)tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have went to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. 
*/ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data toc The socket * buffer and note that we need a delayed ack. Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 
*/ if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && __predict_true(LIST_EMPTY(&tp->t_segq)) && __predict_true(th->th_seq == tp->rcv_nxt)) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tlen == 0) { if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, - ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { + tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { return (0); } } else { if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, - ti_locked, tiwin, nxt_pkt)) { + tiwin, nxt_pkt)) { return (0); } } } rack_calc_rwin(so, tp); if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 
3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in rack_process_data() */ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
*/ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } static int rack_check_data_after_close(struct mbuf *m, - struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) + struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { - struct tcp_rack *rack; + struct tcp_rack *rack; - KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " - "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); + rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; 
rack->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { - if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 
2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. 
*/ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
*/ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { - if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); - *ti_locked = TI_UNLOCKED; m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { - if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. 
*/ static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp, ti_locked)); + return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, ti_locked, &ret_val); + rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) + if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { + if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { - if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) + if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 
3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); + rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ - if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - ti_locked, tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } static void inline rack_clear_rate_sample(struct tcp_rack *rack) { rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; rack->r_ctl.rack_rs.rs_rtt_cnt = 0; rack->r_ctl.rack_rs.rs_rtt_tot = 0; } static int rack_init(struct tcpcb *tp) { struct tcp_rack *rack = NULL; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { /* * We need to allocate memory but cant. The INP and INP_INFO * locks and they are recusive (happens during setup. 
So a * scheme to drop the locks fails :( * */ return (ENOMEM); } memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); rack = (struct tcp_rack *)tp->t_fb_ptr; TAILQ_INIT(&rack->r_ctl.rc_map); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); rack->r_cpu = 0; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; rack->rc_pace_reduce = rack_slot_reduction; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; rack->rc_pace_max_segs = rack_hptsi_segments; rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; rack->r_enforce_min_pace = rack_min_pace_time; rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->rc_always_pace = rack_pace_every_seg; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; rack->r_ctl.rc_prr_inc_var = rack_inc_var; rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; rsm = rack_alloc(rack); if (rsm == NULL) { uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; return (ENOMEM); } rsm->r_flags = RACK_OVERMAX; rsm->r_tim_lastsent[0] = tcp_ts_getticks(); rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 
0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } return (0); } static int rack_handoff_ok(struct tcpcb *tp) { if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { /* Sure no problem though it may not stick */ return (0); } if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) { /* * We really don't know you have to get to ESTAB or beyond * to tell. */ return (EAGAIN); } if (tp->t_flags & TF_SACK_PERMIT) { return (0); } /* * If we reach here we don't do SACK on this connection so we can * never do rack. */ return (EINVAL); } static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm; rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } rack->rc_free_cnt = 0; uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } } static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) { switch (tp->t_state) { case TCPS_SYN_SENT: rack->r_state = TCPS_SYN_SENT; rack->r_substate = rack_do_syn_sent; break; case TCPS_SYN_RECEIVED: rack->r_state = TCPS_SYN_RECEIVED; rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; case TCPS_CLOSE_WAIT: rack->r_state = TCPS_CLOSE_WAIT; rack->r_substate = rack_do_close_wait; break; case TCPS_FIN_WAIT_1: rack->r_state = TCPS_FIN_WAIT_1; rack->r_substate = rack_do_fin_wait_1; break; case TCPS_CLOSING: 
rack->r_state = TCPS_CLOSING; rack->r_substate = rack_do_closing; break; case TCPS_LAST_ACK: rack->r_state = TCPS_LAST_ACK; rack->r_substate = rack_do_lastack; break; case TCPS_FIN_WAIT_2: rack->r_state = TCPS_FIN_WAIT_2; rack->r_substate = rack_do_fin_wait_2; break; case TCPS_LISTEN: case TCPS_CLOSED: case TCPS_TIME_WAIT: default: #ifdef INVARIANTS panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); #endif break; }; } static void rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) { /* * We received an ack, and then did not * call send or were bounced out due to the * hpts was running. Now a timer is up as well, is * it the right timer? */ struct rack_sendmap *rsm; int tmr_up; tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && (tmr_up == PACE_TMR_RXT)) { /* Should be an RXT */ return; } if (rsm == NULL) { /* Nothing outstanding? */ if (tp->t_flags & TF_DELACK) { if (tmr_up == PACE_TMR_DELACK) /* We are supposed to have delayed ack up and we do */ return; } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { /* * if we hit enobufs then we would expect the possiblity * of nothing outstanding and the RXT up (and the hptsi timer). 
*/ return; } else if (((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* needs to be a RXT */ if (tmr_up == PACE_TMR_RXT) return; } else if (tmr_up == PACE_TMR_RACK) return; } else if (SEQ_GT(tp->snd_max,tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RXT))) { /* * Either a TLP or RXT is fine if no sack-passed * is in place and data is outstanding. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off * before the rtx/tlp/rack timer were going to * expire, then that would be the timer in control. * Note we don't check the time here trusting the * code is correct. */ return; } /* * Ok the timer originally started is not what we want now. * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, - int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) + int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval, did_out = 0; int32_t way_out = 0; uint32_t cts; uint32_t tiwin; struct tcpopt to; struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t prev_state = 0; cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; /* * If this is either a state-changing packet or current state isn't * established, we require a read lock on tcbinfo. 
Otherwise, we * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " - "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true); } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { #ifdef NETFLIX_CWV if ((tp->cwv_enabled) && ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { tcp_newcwv_nvp_closedown(tp); } else #endif if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } } rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) tcp_newcwv_nvp_closedown(tp); } #endif /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. 
*/ tiwin = th->th_win << tp->snd_scale; #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { rack_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, fall back to * non RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, cts)) to.to_tsecr = 0; } /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! */ if (rack->r_state == 0) { /* Should be init'd by rack_init() */ KASSERT(rack->rc_inp != NULL, ("%s: rack->rc_inp unexpectedly NULL", __func__)); if (rack->rc_inp == NULL) { rack->rc_inp = tp->t_inpcb; } /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ rack->r_cpu = inp_to_cpuid(tp->t_inpcb); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. 
It will be updated with the * next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = cts; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if * TF_SACK_PERMIT is set, if not rack is *not* possible and * we switch to the default code. */ if ((tp->t_flags & TF_SACK_PERMIT) == 0) { tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, - tlen, iptos, ti_locked); + tlen, iptos); return; } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); rack_stop_all_timers(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ if (rack->r_state != tp->t_state) rack_set_state(tp, rack); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; rack->r_ctl.rc_tlp_send_cnt = 0; rack_clear_rate_sample(rack); retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, - tlen, &ti_locked, tiwin, thflags, nxt_pkt); + tlen, tiwin, thflags, nxt_pkt); #ifdef INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif - if (ti_locked != TI_UNLOCKED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - } if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. 
*/ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_rack_xmit_timer_commit(rack, tp); if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && (rack->rc_in_persist == 0)){ /* * The peer shrunk its window on us to the point * where we have sent too much. The only thing * we can do here is stop any timers and * enter persist. We most likely lost the last * bytes we sent but oh well, we will have to * retransmit them after the peer is caught up. */ if (rack->rc_inp->inp_in_hpts) tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack_timer_cancel(tp, rack, cts, __LINE__); rack_enter_persist(tp, rack, cts); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); way_out = 3; goto done_with_input; } if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); } if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { if (rack->rc_inp->inp_in_hpts) tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } way_out = 1; } else { /* Do we have the correct timer running? 
*/ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; #ifdef INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif INP_WUNLOCK(tp->t_inpcb); } } void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, - int32_t ti_locked) + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; #ifdef RSS struct tcp_function_block *tfb; struct tcp_rack *rack; - struct inpcb *inp; + struct epoch_tracker et; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_state == 0) { /* * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ - INP_INFO_RLOCK(); - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, ti_locked, 0, &tv); + tlen, iptos, 0, &tv); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return; } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); + tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); INP_WUNLOCK(tp->t_inpcb); #else tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, ti_locked, 0, &tv); + tlen, iptos, 0, &tv); #endif } struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) { struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; /* Return the next guy to be re-transmitted */ if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* retran the end FIN? 
*/ return (NULL); } /* ok lets look at this one */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { goto check_it; } rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { return (NULL); } check_it: srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; if (rsm->r_flags & RACK_ACKED) { return (NULL); } if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { /* Its not yet ready */ return (NULL); } idx = rsm->r_rtr_cnt - 1; ts_low = rsm->r_tim_lastsent[idx]; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused <= ts_low) { return (NULL); } if ((tsused - ts_low) >= thresh) { return (rsm); } return (NULL); } static int rack_output(struct tcpcb *tp) { struct socket *so; uint32_t recwin, sendwin; uint32_t sb_offset; int32_t len, flags, error = 0; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct udphdr *udp = NULL; struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen=0; uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int32_t idle, sendalot; int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; int32_t tso, mtu, would_have_fin = 0; struct tcpopt to; int32_t slot = 0; uint32_t cts; uint8_t hpts_calling, doing_tlp = 0; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; struct sockbuf *sb; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); 
do_a_prefetch = 1; INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ return (0); #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ isipv6 = rack->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif cts = tcp_ts_getticks(); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. */ rack_timer_cancel(tp, rack, cts, __LINE__); } /* Mark that we have called rack_output(). */ if ((rack->r_timer_override) || (tp->t_flags & TF_FORCEDATA) || (tp->t_state < TCPS_ESTABLISHED)) { if (tp->t_inpcb->inp_in_hpts) tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } else if (tp->t_inpcb->inp_in_hpts) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. */ counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (rack_process_timers(tp, rack, cts, hpts_calling)) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } rack->r_wanted_output = 0; rack->r_timer_override = 0; /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls * (SYN, RST) to send, then transmit; otherwise, investigate * further. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) tcp_newcwv_nvp_closedown(tp); } else #endif if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ sendalot = 0; cts = tcp_ts_getticks(); tso = 0; mtu = 0; sb_offset = tp->snd_max - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly * trying to send out new data (when sendalot is 1), bypass this * function. If we retransmit in fast recovery mode, decrement * snd_cwnd, since we're replacing a (future) new transmission with * a retransmission now, and we previously incremented snd_cwnd in * tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. 
*/ while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { if (inp->inp_hpts_calls) /* Retry in a ms */ slot = 1; goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt++; rsm = NULL; } if (inp->inp_hpts_calls) inp->inp_hpts_calls = 0; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ long cwin; long tlen; doing_tlp = 1; rsm = rack->r_ctl.rc_tlpsend; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; if (tlen > tp->t_maxseg) tlen = tp->t_maxseg; #ifdef INVARIANTS if (SEQ_GT(tp->snd_una, rsm->r_start)) { panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", tp, rack, tp->snd_una, rsm, rsm->r_start); } #endif sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; } else if (rack->r_ctl.rc_resend) { /* Retransmit timer */ rsm = rack->r_ctl.rc_resend; rack->r_ctl.rc_resend = NULL; len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; sb_offset = rsm->r_start - tp->snd_una; if (len >= tp->t_maxseg) { len = tp->t_maxseg; } KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", __func__, sb_offset)); } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { long tlen; if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. 
*/ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif tlen = rsm->r_end - rsm->r_start; sb_offset = rsm->r_start - tp->snd_una; if (tlen > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; } else { len = tlen; } if (len >= tp->t_maxseg) { sendalot = 1; len = tp->t_maxseg; } else { sendalot = 0; if ((rack->rc_timer_up == 0) && (len < tlen)) { /* * If its not a timer don't send a partial * segment. */ len = 0; goto just_return_nolock; } } KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", __func__, sb_offset)); if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); counter_u64_add(rack_rtm_prr_retran, 1); } } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe. */ flags &= ~TH_FIN; } } #ifdef INVARIANTS /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If in persist timeout with window of 0, send 1 byte. Otherwise, * if window is small but nonzero and time TF_SENTFIN expired, we * will send what we can and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then clear * the FIN bit. Usually this would happen below * when it realizes that we aren't sending all the * data. 
However, if we have exactly 1 byte of * unsent data, then it won't clear the FIN bit * below, and if we are in persist state, we wind up * sending the packet without recording that we sent * the FIN bit. * * We can't just blindly clear the FIN bit, because * if we don't have any more data to send then the * probe will be the FIN itself. */ if (sb_offset < sbused(sb)) flags &= ~TH_FIN; sendwin = 1; } else { if (rack->rc_in_persist) rack_exit_persist(tp, rack); /* * If we are dropping persist mode then we need to * correct snd_nxt/snd_max and off. */ tp->snd_nxt = tp->snd_max; sb_offset = tp->snd_nxt - tp->snd_una; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. 
*/ if (sack_rxmit == 0) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; if (IN_RECOVERY(tp->t_flags) == 0) { if (rack->r_ctl.rc_tlp_new_data) { /* TLP is forcing out new data */ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); } if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) len = tp->snd_wnd; else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; doing_tlp = 1; } else { if (sendwin > avail) { /* use the available */ if (avail > sb_offset) { len = (int32_t)(avail - sb_offset); } else { len = 0; } } else { if (sendwin > sb_offset) { len = (int32_t)(sendwin - sb_offset); } else { len = 0; } } } } else { uint32_t outstanding; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible so far in the scoreboard. */ outstanding = tp->snd_max - tp->snd_una; if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) len = 0; else if (avail > sb_offset) len = avail - sb_offset; else len = 0; if (len > 0) { if (len > rack->r_ctl.rc_prr_sndcnt) len = rack->r_ctl.rc_prr_sndcnt; if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); } } if (len > tp->t_maxseg) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr * mode unless the override flag is on. Most * likely the PRR algorithm is not going to * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) len = tp->t_maxseg; } else if (len < tp->t_maxseg) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to * do it. However if not then lets just wait * for our prr_sndcnt to get bigger. 
*/ long leftinsb; leftinsb = sbavail(sb) - sb_offset; if (leftinsb > len) { /* This send does not empty the sb */ len = 0; } } } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; sb_offset--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) len = 0; /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. 
If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. 
*/ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) tso = 1; { uint32_t outstanding; outstanding = tp->snd_max - tp->snd_una; if (tp->t_flags & TF_SENTFIN) { /* * If we sent a fin, snd_max is 1 higher than * snd_una */ outstanding--; } if (outstanding > 0) { /* * This is sub-optimal. We only send a stand alone * FIN on its own segment. */ if (flags & TH_FIN) { flags &= ~TH_FIN; would_have_fin = 1; } } else if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(sb))) flags &= ~TH_FIN; } } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. persist timer) - we have more * then 1/2 the maximum send window's worth of data (receiver may be * limited the window size) - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) { pass = 1; goto send; } /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && (tp->t_flags & TF_NOPUSH) == 0) { pass = 2; goto send; } if (tp->t_flags & TF_FORCEDATA) { /* typ. 
timeout case */ pass = 3; goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { pass = 4; goto send; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ pass = 5; goto send; } if (sack_rxmit) { pass = 6; goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertized window by more than * 1/4th of the socket buffer capacity. When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACK's to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, taking * into account that we are limited by TCP_MAXWIN << * tp->rcv_scale. 
*/ int32_t adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { pass = 7; goto send; } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) { pass = 8; goto send; } if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { pass = 9; goto send; } if (SEQ_GT(tp->snd_up, tp->snd_una)) { pass = 10; goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if (flags & TH_FIN) { if ((tp->t_flags & TF_SENTFIN) || (((tp->t_flags & TF_SENTFIN) == 0) && (tp->snd_nxt == tp->snd_una))) { pass = 11; goto send; } } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: if (tot_len_this_send == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); tp->t_flags &= ~TF_FORCEDATA; return (0); send: if (doing_tlp == 0) { /* * Data not a TLP, and its not the rxt firing. If it is the * rxt firing, we want to leave the tlp_in_progress flag on * so we don't send another TLP. It has to be a rack timer * or normal send (response to acked data) to clear the tlp * in progress flag. 
*/ rack->rc_tlp_in_progress = 0; } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ sendalot = 0; } } } /* Window scaling. 
*/ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = cts + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } #ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. 
*/ if (len + optlen + ipoptlen > tp->t_maxseg) { if (flags & TH_FIN) { would_have_fin = 1; flags &= ~TH_FIN; } if (tso) { uint32_t if_hw_tsomax; uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); if ((sb_offset + len) < sbavail(sb)) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment after the bulk * sending is done. We don't trust the TSO * implementations to clear the FIN flag on all but * the last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. 
*/ goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t max_val; uint32_t moff; if (rack->rc_pace_max_segs) max_val = rack->rc_pace_max_segs * tp->t_maxseg; else max_val = len; /* * We allow a limit on sending with hptsi. */ if (len > max_val) { len = max_val; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = sb; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. 
*/ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef NETFLIX_STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but * in its own bin */ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef NETFLIX_STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.) */ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) flags |= TH_PUSH; /* * Are we doing hptsi, if so we must calculate the slot. We * only do hptsi in ESTABLISHED and with no RESET being * sent where we have data to send. */ if (((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_CLOSE_WAIT) || ((tp->t_state == TCPS_FIN_WAIT_1) && ((tp->t_flags & TF_SENTFIN) == 0) && ((flags & TH_FIN) == 0))) && ((flags & TH_RST) == 0) && (rack->rc_always_pace)) { /* * We use the most optimistic possible cwnd/srtt for * sending calculations. This will make our * calculation anticipate getting more through * quicker then possible. But thats ok we don't want * the peer to have a gap in data sending. 
*/ uint32_t srtt, cwnd, tr_perms = 0; if (rack->r_ctl.rc_rack_min_rtt) srtt = rack->r_ctl.rc_rack_min_rtt; else srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); if (rack->r_ctl.rc_rack_largest_cwnd) cwnd = rack->r_ctl.rc_rack_largest_cwnd; else cwnd = tp->snd_cwnd; tr_perms = cwnd / srtt; if (tr_perms == 0) { tr_perms = tp->t_maxseg; } tot_len_this_send += len; /* * Calculate how long this will take to drain, if * the calculation comes out to zero, thats ok we * will use send_a_lot to possibly spin around for * more increasing tot_len_this_send to the point * that its going to require a pace, or we hit the * cwnd. Which in that case we are just waiting for * a ACK. */ slot = tot_len_this_send / tr_perms; /* Now do we reduce the time so we don't run dry? */ if (slot && rack->rc_pace_reduce) { int32_t reduce; reduce = (slot / rack->rc_pace_reduce); if (reduce < slot) { slot -= reduce; } else slot = 0; } if (rack->r_enforce_min_pace && (slot == 0) && (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { /* We are enforcing a minimum pace time of 1ms */ slot = rack->r_enforce_min_pace; } } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); 
udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } /* * Fill in fields, remembering maximum advertised window for use in * delaying messages about window sizes. If resending a FIN, be sure * not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup SYN packet. If we * are on a retransmit, we may resend those bits a number of times * as per RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE | TH_CWR; } else flags |= TH_ECE | TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with ECN capable * transmission (ECT). Ignore pure ack packets, * retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. 
*/ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN | TH_FIN)) || rack->rc_in_persist) { th->th_seq = htonl(tp->snd_nxt); rack_seq = tp->snd_nxt; } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); rack_seq = tp->snd_una; } else { th->th_seq = htonl(tp->snd_max); rack_seq = tp->snd_max; } } else { th->th_seq = htonl(rsm->r_start); rack_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a or * ) segment itself is never scaled. The case is * handled in syncache. 
*/ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull the urgent * pointer to the left edge of the send window so that it * doesn't drift into the send window on sequence number * wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. 
*/ if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. 
*/ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* We're getting ready to send; log now. */ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; if (rsm || sack_rxmit) { log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, NULL); } else lgb = NULL; /* * Fill in IP length and desired time to live and send to IP level. * There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. Also, * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. ip6_output() will set it properly; it's supposed * to include the option header lengths as well. 
*/ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &inp->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) mtu = inp->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache migitates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) mtu = inp->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. 
*/ if (error == 0) { if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); } else if (len > 1) { int idx; idx = (len / tp->t_maxseg) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } } if (sub_from_prr && (error == 0)) { rack->r_ctl.rc_prr_sndcnt -= len; } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { #ifdef NETFLIX_STATS tcp_seq startseq = tp->snd_nxt; #endif /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto timer; if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(sb) - sb_offset, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif } /* * Set retransmit timer if not currently set, and not doing * a pure ack or a keep-alive probe. Initial value for * retransmit timer is smoothed round-trip time + 2 * * round-trip time variance. Initialize shift counter which * is used for backoff of retransmit time. */ timer: if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If the persists timer was set above (right before * the goto send), and still needs to be on. 
Lets * make sure all is canceled. If the persist timer * is not running, we want to get it up. */ if (rack->rc_in_persist == 0) { rack_enter_persist(tp, rack, cts); } } } else { /* * Persist case, update snd_max but since we are in persist * mode (no window) we do not update snd_nxt. */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } /* In the ENOBUFS case we do *not* update snd_max */ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt + len; } } nomore: if (error) { SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and retry in 1ms with * the hpts. Everything else will just have to retransmit * with the timer. * * In any case, we do not want to loop around for another * send without a good reason. */ sendalot = 0; switch (error) { case EPERM: tp->t_flags &= ~TF_FORCEDATA; tp->t_softerror = error; return (error); case ENOBUFS: if (slot == 0) { /* * Pace us right away to retry in a some * time */ slot = 1 + rack->rc_enobuf; if (rack->rc_enobuf < 255) rack->rc_enobuf++; if (slot > (rack->rc_rack_rtt / 2)) { slot = rack->rc_rack_rtt / 2; } if (slot < 10) slot = 10; } counter_u64_add(rack_saw_enobuf, 1); error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. 
*/ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); } } else { rack->rc_enobuf = 0; } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. Any pending ACK has now been sent. */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: rack->r_tlp_running = 0; if ((flags & TH_RST) || (would_have_fin == 1)) { /* * We don't send again after a RST. We also do *not* send * again if we would have had a find, but now have * outstanding data. */ slot = 0; sendalot = 0; } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); } else if (sendalot) { if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; tp->t_flags &= ~TF_FORCEDATA; goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } tp->t_flags &= ~TF_FORCEDATA; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); return (error); } /* * rack_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. 
*/ static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error = 0, optval; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: case TCP_RACK_PROP: case TCP_RACK_TLP_REDUCE: case TCP_RACK_EARLY_RECOV: case TCP_RACK_PACE_ALWAYS: case TCP_DELACK: case TCP_RACK_PACE_REDUCE: case TCP_RACK_PACE_MAX_SEG: case TCP_RACK_PRR_SENDALOT: case TCP_RACK_MIN_TO: case TCP_RACK_EARLY_SEG: case TCP_RACK_REORD_THRESH: case TCP_RACK_REORD_FADE: case TCP_RACK_TLP_THRESH: case TCP_RACK_PKT_DELAY: case TCP_RACK_TLP_USE: case TCP_RACK_TLP_INC_VAR: case TCP_RACK_IDLE_REDUCE_HIGH: case TCP_RACK_MIN_PACE: case TCP_RACK_MIN_PACE_SEG: case TCP_BBR_RACK_RTT_USE: case TCP_DATA_AFTER_CLOSE: break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: if ((optval <= 0) || (optval >= 100)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_rack_prop_rate); rack->r_ctl.rc_prop_rate = optval; break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_tlp_use); rack->rack_tlp_threshold_use = optval; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ RACK_OPTS_INC(tcp_rack_prop); rack->r_ctl.rc_prop_reduce = optval; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ RACK_OPTS_INC(tcp_rack_early_recov); rack->r_ctl.rc_early_recovery = optval; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method (bool) */ 
RACK_OPTS_INC(tcp_rack_pace_always); if (optval > 0) rack->rc_always_pace = 1; else rack->rc_always_pace = 0; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ RACK_OPTS_INC(tcp_rack_pace_reduce); if (optval) /* Must be non-zero */ rack->rc_pace_reduce = optval; else error = EINVAL; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_pace_max_segs = optval; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); rack->r_ctl.rc_prr_sendalot = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ RACK_OPTS_INC(tcp_rack_min_to); rack->r_ctl.rc_min_to = optval; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ RACK_OPTS_INC(tcp_rack_early_seg); rack->r_ctl.rc_early_recovery_segs = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ RACK_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) rack->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ RACK_OPTS_INC(tcp_rack_reord_fade); rack->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ RACK_OPTS_INC(tcp_rack_tlp_thresh); if (optval) rack->r_ctl.rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); rack->r_ctl.rc_pkt_delay = optval; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ RACK_OPTS_INC(tcp_rack_tlp_inc_var); rack->r_ctl.rc_prr_inc_var = optval; break; case TCP_RACK_IDLE_REDUCE_HIGH: RACK_OPTS_INC(tcp_rack_idle_reduce_high); if (optval) rack->r_idle_reduce_largest = 1; else rack->r_idle_reduce_largest = 0; break; case TCP_DELACK: if (optval == 0) tp->t_delayed_ack = 0; else tp->t_delayed_ack = 1; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; rack_output(tp); } break; case TCP_RACK_MIN_PACE: RACK_OPTS_INC(tcp_rack_min_pace); if (optval > 3) rack->r_enforce_min_pace = 3; else rack->r_enforce_min_pace = optval; break; case TCP_RACK_MIN_PACE_SEG: RACK_OPTS_INC(tcp_rack_min_pace_seg); if (optval >= 16) rack->r_min_pace_seg_thresh = 15; else rack->r_min_pace_seg_thresh = optval; break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && (optval != USE_RTT_AVG)) error = EINVAL; else rack->r_ctl.rc_rate_sample_method = optval; break; case TCP_DATA_AFTER_CLOSE: if (optval) rack->rc_allow_data_af_clo = 1; else rack->rc_allow_data_af_clo = 0; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS tcp_log_socket_option(tp, sopt->sopt_name, optval, error); #endif INP_WUNLOCK(inp); return (error); } static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. 
*/ switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ optval = rack->r_ctl.rc_prop_reduce; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ optval = rack->r_ctl.rc_tlp_cwnd_reduce; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ optval = rack->r_ctl.rc_early_recovery; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ optval = rack->rc_pace_reduce; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = rack->rc_pace_max_segs; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method */ optval = rack->rc_always_pace; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ optval = rack->r_ctl.rc_prr_sendalot; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = rack->r_ctl.rc_min_to; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ optval = rack->r_ctl.rc_early_recovery_segs; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = rack->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ optval = rack->r_ctl.rc_pkt_delay; break; case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ optval = rack->r_ctl.rc_prr_inc_var; break; case TCP_RACK_IDLE_REDUCE_HIGH: optval = rack->r_idle_reduce_largest; break; case TCP_RACK_MIN_PACE: optval = rack->r_enforce_min_pace; break; case TCP_RACK_MIN_PACE_SEG: optval = rack->r_min_pace_seg_thresh; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_DATA_AFTER_CLOSE: optval = rack->rc_allow_data_af_clo; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack == NULL) { /* Huh? 
*/ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (rack_set_sockopt(so, sopt, inp, tp, rack)); } else if (sopt->sopt_dir == SOPT_GET) { return (rack_get_sockopt(so, sopt, inp, tp, rack)); } out: INP_WUNLOCK(inp); return (error); } struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_tcp_do_segment = rack_do_segment, .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, .tfb_tcp_timer_stop_all = rack_stopall, .tfb_tcp_timer_activate = rack_timer_activate, .tfb_tcp_timer_active = rack_timer_active, .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok }; static const char *rack_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static int rack_ctor(void *mem, int32_t size, void *arg, int32_t how) { memset(mem, 0, size); return (0); } static void rack_dtor(void *mem, int32_t size, void *arg) { } static bool rack_mod_inited = false; static int tcp_addrack(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct rack_sendmap), rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_rack), rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&rack_sysctl_ctx); rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, __XSTRING(STACKNAME), CTLFLAG_RW, 0, ""); if (rack_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } rack_init_sysctls(); num_stacks = nitems(rack_stack_names); err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, rack_stack_names, &num_stacks); if (err) { printf("Failed to register %s stack name for " "%s 
module\n", rack_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&rack_sysctl_ctx); free_uma: uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); rack_counter_destroy(); printf("Failed to register rack module -- err:%d\n", err); return (err); } rack_mod_inited = true; break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_rack, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_rack, false, true); if (err == EBUSY) break; if (rack_mod_inited) { uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); sysctl_ctx_free(&rack_sysctl_ctx); rack_counter_destroy(); rack_mod_inited = false; } err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_rack = { .name = __XSTRING(MODNAME), .evhand = tcp_addrack, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); Index: head/sys/netinet/tcp_stacks/tcp_rack.h =================================================================== --- head/sys/netinet/tcp_stacks/tcp_rack.h (revision 335923) +++ head/sys/netinet/tcp_stacks/tcp_rack.h (revision 335924) @@ -1,321 +1,321 @@ /*- * Copyright (c) 2016 * Netflix Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_TCP_RACK_H_ #define _NETINET_TCP_RACK_H_ #define RACK_ACKED 0x0001/* The remote endpoint acked this */ #define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ #define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ #define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ #define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ #define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ #define RACK_HAS_FIN 0x0040/* segment is sent with fin */ #define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ struct rack_sendmap { TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ uint8_t r_flags; /* Flags as defined above */ uint8_t r_sndcnt; /* Retran count, not limited by * RACK_NUM_OF_RETRANS */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_resv[3]; }; TAILQ_HEAD(rack_head, rack_sendmap); /* * We use the rate sample structure to * assist in single sack/ack rate and 
rtt * calculation. In the future we will expand * this in BBR to do forward rate sample * b/w estimation. */ #define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ #define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */ struct rack_rtt_sample { uint32_t rs_flags; uint32_t rs_rtt_lowest; uint32_t rs_rtt_highest; uint32_t rs_rtt_cnt; uint64_t rs_rtt_tot; }; #define RACK_LOG_TYPE_ACK 0x01 #define RACK_LOG_TYPE_OUT 0x02 #define RACK_LOG_TYPE_TO 0x03 #define RACK_LOG_TYPE_ALLOC 0x04 #define RACK_LOG_TYPE_FREE 0x05 struct rack_log { union { struct rack_sendmap *rsm; /* For alloc/free */ uint64_t sb_acc;/* For out/ack or t-o */ }; uint32_t th_seq; uint32_t th_ack; uint32_t snd_una; uint32_t snd_nxt; /* th_win for TYPE_ACK */ uint32_t snd_max; uint32_t blk_start[4]; uint32_t blk_end[4]; uint8_t type; uint8_t n_sackblks; uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */ }; /* * Magic numbers for logging timeout events if the * logging is enabled. */ #define RACK_TO_FRM_TMR 1 #define RACK_TO_FRM_TLP 2 #define RACK_TO_FRM_RACK 3 #define RACK_TO_FRM_KEEP 4 #define RACK_TO_FRM_PERSIST 5 #define RACK_TO_FRM_DELACK 6 struct rack_opts_stats { uint64_t tcp_rack_prop_rate; uint64_t tcp_rack_prop; uint64_t tcp_rack_tlp_reduce; uint64_t tcp_rack_early_recov; uint64_t tcp_rack_pace_always; uint64_t tcp_rack_pace_reduce; uint64_t tcp_rack_max_seg; uint64_t tcp_rack_prr_sendalot; uint64_t tcp_rack_min_to; uint64_t tcp_rack_early_seg; uint64_t tcp_rack_reord_thresh; uint64_t tcp_rack_reord_fade; uint64_t tcp_rack_tlp_thresh; uint64_t tcp_rack_pkt_delay; uint64_t tcp_rack_tlp_inc_var; uint64_t tcp_tlp_use; uint64_t tcp_rack_idle_reduce; uint64_t tcp_rack_idle_reduce_high; uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_min_pace; }; #define TLP_USE_ID 1 /* Internet draft behavior */ #define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */ #define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ #ifdef _KERNEL #define RACK_OPTS_SIZE (sizeof(struct 
rack_opts_stats)/sizeof(uint64_t)) extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; #define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm)) #define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1) #endif /* * As we get each SACK we wade through the * rc_map and mark off what is acked. * We also increment rc_sacked as well. * * We also pay attention to missing entries * based on the time and possibly mark them * for retransmit. If we do and we are not already * in recovery we enter recovery. In doing * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. * We also setup rc_next/rc_snd_nxt/rc_send_end so * we will know where to send from. When not in * recovery rc_next will be NULL and rc_snd_nxt should * equal snd_max. * * Whenever we retransmit from recovery we increment * rc_holes_rxt as we retran a block and mark it as retransmitted * with the time it was sent. During non-recovery sending we * add to our map and note the time down of any send expanding * the rc_map at the tail and moving rc_snd_nxt up with snd_max. * * In recovery during SACK/ACK processing if a chunk has * been retransmitted and it is now acked, we decrement rc_holes_rxt. * When we retransmit from the scoreboard we use * rc_next and rc_snd_nxt/rc_send_end to help us * find what needs to be retran. * * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt * This gets us the effect of RFC6675 pipe, counting twice for * bytes retransmitted. */ #define TT_RACK_FR_TMR 0x2000 /* * Locking for the rack control block. 
* a) Locked by INP_WLOCK * b) Locked by the hpts-mutex * */ struct rack_control { /* Second cache line 0x40 from tcp_rack */ struct rack_head rc_map;/* List of all segments Lock(a) */ struct rack_head rc_tmap; /* List in transmit order Lock(a) */ struct rack_sendmap *rc_tlpsend; /* Remembered place for * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ uint32_t rc_hpts_flags; uint32_t rc_timer_exp; /* If a timer ticks of expiry */ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */ /* Third Cache line 0x80 */ struct rack_head rc_free; /* Allocation array */ uint32_t rc_time_last_sent; /* Time we last sent some data and * logged it Lock(a). */ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP * Lock(a) */ uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */ uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */ uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done * since peer spoke to us Lock(a) */ uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent * rc_last_tlp_seq Lock(a) */ uint32_t rc_loss_count; /* During recovery how many segments were lost * Lock(a) */ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ /* Forth cache line 0xc0 */ /* Times */ uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */ uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */ /* Variables to track bad retransmits and recover */ uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */ uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */ uint32_t rc_ssthresh_at;/* ssthresh at 
the retransmit Lock(a) */ uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we * have allocated */ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_notused; uint32_t rc_last_output_to; uint32_t rc_went_idle_time; struct rack_sendmap *rc_sacklast; /* sack remembered place * Lock(a) */ struct rack_sendmap *rc_next; /* remembered place where we next * retransmit at Lock(a) */ struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for * cache line alignment * Lock(a) */ /* Cache line split 0x100 */ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ struct rack_rtt_sample rack_rs; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ uint8_t rc_prop_rate; /* Socket option value Lock(a) */ uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */ uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ uint8_t rc_rate_sample_method; }; #ifdef _KERNEL struct tcp_rack { /* First cache line 0x00 */ TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, - int32_t, int32_t, int32_t *, uint32_t, int, int); /* Lock(a) */ + int32_t, int32_t, uint32_t, int, int); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ uint32_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ uint16_t r_wanted_output; /* 
Output routine wanted to be called */ uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */ uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */ uint16_t rc_pace_reduce;/* Socket option value Lock(a) */ uint8_t r_state; /* Current rack state Lock(a) */ uint8_t rc_tmr_stopped : 7, t_timers_stopped : 1; uint8_t rc_enobuf; /* count of enobufs on connection provides * backoff Lock(a) */ uint8_t r_timer_override : 1, /* hpts override Lock(a) */ r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */ r_is_v6 : 1, /* V6 pcb Lock(a) */ rc_in_persist : 1, rc_last_pto_set : 1, /* XXX not used */ rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ rc_timer_up : 1; /* The rack timer is up flag Lock(a) */ uint8_t r_idle_reduce_largest : 1, r_enforce_min_pace : 2, r_min_pace_seg_thresh : 5; uint8_t rack_tlp_threshold_use; uint8_t rc_allow_data_af_clo: 1, delayed_ack : 1, rc_avail : 6; uint8_t r_resv[2]; /* Fill to cache line boundary */ /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); #endif #endif Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 335923) +++ head/sys/netinet/tcp_subr.c (revision 335924) @@ -1,3221 +1,3226 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif struct rwlock 
tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
*/ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif static int tcp_default_fb_init(struct tcpcb *tp); static void tcp_default_fb_fini(struct tcpcb 
*tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, }; int t_functions_inited = 0; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static void init_tcp_functions(void) { if (t_functions_inited == 0) { TAILQ_INIT(&t_functions); rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); t_functions_inited = 1; } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; 
rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); /* * Release the old stack. This function will either find a new one * or panic. */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. */ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? */ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init == NULL || (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) return; /* * Initialization failed. Release the reference count on * the stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. 
*/ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if (tfb->tfb_tcp_handoff_ok != NULL) { if ((*tfb->tfb_tcp_handoff_ok) (tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } } tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init != NULL && (*tp->t_fb->tfb_tcp_fb_init)(tp)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB 
count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', alias ? f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. */ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. 
*/ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strncpy(tfi.tfi_alias, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); tfi.tfi_alias[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; (void)strncpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. */ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. * * The init() fuction from the default can *never* return non-zero i.e. * it is required to always succeed since it is the stack of last resort! 
*/ static int tcp_default_fb_init(struct tcpcb *tp) { struct socket *so; INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT, ("%s: connection %p in unexpected state %d", __func__, tp, tp->t_state)); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't * know what to do for unexpected states (which includes TIME_WAIT). */ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) return (0); /* * Make sure some kind of transmission timer is set if there is * outstanding data. */ so = tp->t_inpcb->inp_socket; if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || tcp_timer_active(tp, TT_PERSIST))) { /* * If the session has established and it looks like it should * be in the persist state, set the persist timer. Otherwise, * set the retransmit timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && (int32_t)(tp->snd_nxt - tp->snd_una) < (int32_t)sbavail(&so->so_snd)) tcp_setpersist(tp); else tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } /* All non-embryonic sessions get a keepalive timer. */ if (!tcp_timer_active(tp, TT_KEEP)) tcp_timer_activate(tp, TT_KEEP, TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); return (0); } /* * tfb_tcp_fb_fini() function for the default stack. * * This changes state as necessary (or prudent) to prepare for another stack * to assume responsibility for the connection. */ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tp->t_inpcb); return; } /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. 
They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; #ifdef TCP_HHOOK struct osd osd; #endif }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. 
* * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); if (t_functions_inited == 0) { init_tcp_functions(); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { *num_names = 0; return (EINVAL); } } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_flags = 0; blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strncpy(fs.function_set_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); fs.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. 
Because registration failed * for names[i], we don't need to deregister that name. */ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed, or an error * code on failure. 
*/ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. */ if (force && blk->tfb_refcnt) { struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { INP_WUNLOCK(inp); continue; } tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) { INP_WUNLOCK(inp); continue; } tcp_switch_back_to_default(tp); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. 
*/ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose && IS_DEFAULT_VNET(curvnet)) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. 
*/ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; tcp_reass_global_init(); /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; /* Setup the tcp function block list */ init_tcp_functions(); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. 
*/ tcp_log_init(); #endif if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); #ifdef TCPPCAP tcp_pcap_init(); #endif } #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_LIST_RLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. 
*/ tcp_fastopen_destroy(); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later 
for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
*/
/*
 * Parameter notes, as far as the body below shows:
 *   tp     connection control block, may be NULL (then m must not be);
 *          when non-NULL its inpcb must be write-locked.
 *   ipgen  IPv4 or IPv6 header; the ip_v field selects the family.
 *   th     TCP header to copy ports from.
 *   m      mbuf to reuse or replace, or NULL to allocate a fresh one
 *          (which also forces flags to TH_ACK).
 *   ack, seq, flags  values written into the outgoing TCP header.
 * The function returns nothing; allocation failures silently drop the
 * response (freeing m where it was supplied).
 */
void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
    tcp_seq ack, tcp_seq seq, int flags)
{
	struct tcpopt to;
	struct inpcb *inp;
	struct ip *ip;
	struct mbuf *optm;
	struct tcphdr *nth;
	u_char *optp;
#ifdef INET6
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int optlen, tlen, win;
	bool incl_opts;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp != NULL) {
		inp = tp->t_inpcb;
		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
		INP_WLOCK_ASSERT(inp);
	} else
		inp = NULL;

	/*
	 * Without a tcpcb the advertised window stays 0 and no options are
	 * included.  With one, the window is the receive buffer space,
	 * capped at TCP_MAXWIN << rcv_scale, except for RST segments.
	 */
	incl_opts = false;
	win = 0;
	if (tp != NULL) {
		if (!(flags & TH_RST)) {
			win = sbspace(&inp->inp_socket->so_rcv);
			if (win > TCP_MAXWIN << tp->rcv_scale)
				win = TCP_MAXWIN << tp->rcv_scale;
		}
		if ((tp->t_flags & TF_NOOPT) == 0)
			incl_opts = true;
	}
	if (m == NULL) {
		/*
		 * No mbuf supplied: allocate one and copy the IP and TCP
		 * headers into it unchanged (no address or port swap).
		 */
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL)
			return;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else if (!M_WRITABLE(m)) {
		struct mbuf *n;

		/* Can't reuse 'm', allocate a new mbuf. */
		n = m_gethdr(M_NOWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(m);
			return;
		}

		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
			m_freem(m);
			m_freem(n);
			return;
		}

		n->m_data += max_linkhdr;
		/* m_len is set later */
		/*
		 * xchg() swaps two lvalues of the given type; it stays
		 * defined until the #undef in the final else branch below.
		 */
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(n, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(n, struct ip6_hdr *);
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
			ip = mtod(n, struct ip *);
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		xchg(nth->th_dport, nth->th_sport, uint16_t);
		/* From here on th refers to the copy; the original is freed. */
		th = nth;
		m_freem(m);
		m = n;
	} else {
		/*
		 *  reuse the mbuf.
		 * XXX MRT We inherit the FIB, which is lucky.
		 */
		m_freem(m->m_next);
		m->m_next = NULL;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, uint16_t);
#undef xchg
	}
	/* Base length: IP header plus a TCP header without options. */
	tlen = 0;
#ifdef INET6
	if (isipv6)
		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		tlen = sizeof (struct tcpiphdr);
#endif
#ifdef INVARIANTS
	m->m_len = 0;
	KASSERT(M_TRAILINGSPACE(m) >= tlen,
	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
	    m, tlen, (long)M_TRAILINGSPACE(m)));
#endif
	m->m_len = tlen;
	to.to_flags = 0;
	if (incl_opts) {
		/*
		 * Make sure we have room.  If the first mbuf cannot hold
		 * TCP_MAXOLEN more bytes, the options go into a second mbuf;
		 * if that allocation fails, the options are dropped.
		 */
		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
			m->m_next = m_get(M_NOWAIT, MT_DATA);
			if (m->m_next) {
				optp = mtod(m->m_next, u_char *);
				optm = m->m_next;
			} else
				incl_opts = false;
		} else {
			optp = (u_char *) (nth + 1);
			optm = m;
		}
	}
	if (incl_opts) {
		/* Timestamps. */
		if (tp->t_flags & TF_RCVD_TSTMP) {
			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
			to.to_tsecr = tp->ts_recent;
			to.to_flags |= TOF_TS;
		}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
		/* TCP-MD5 (RFC2385). */
		if (tp->t_flags & TF_SIGNATURE)
			to.to_flags |= TOF_SIGNATURE;
#endif
		/* Add the options. */
		tlen += optlen = tcp_addoptions(&to, optp);

		/* Update m_len in the correct mbuf. */
		optm->m_len += optlen;
	} else
		optlen = 0;
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		ip->ip_len = htons(tlen);
		ip->ip_ttl = V_ip_defttl;
		if (V_path_mtu_discovery)
			ip->ip_off |= htons(IP_DF);
	}
#endif
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
#ifdef MAC
	if (inp != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		INP_WLOCK_ASSERT(inp);
		mac_inpcb_create_mbuf(inp, m);
	} else {
		/*
		 * Packet is not associated with a socket, so possibly
		 * update the label in place.
		 */
		mac_netinet_tcp_reply(m);
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	/* Header length in 32-bit words, options included. */
	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	nth->th_flags = flags;
	if (tp != NULL)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;

#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	/* A requested signature that cannot be produced drops the packet. */
	if (to.to_flags & TOF_SIGNATURE) {
		if (!TCPMD5_ENABLED() ||
		    TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) {
			m_freem(m);
			return;
		}
	}
#endif

	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
	if (isipv6) {
		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
		nth->th_sum = in6_cksum_pseudo(ip6,
		    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
		    NULL, NULL);
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		m->m_pkthdr.csum_flags = CSUM_TCP;
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
	}
#endif /* INET */
#ifdef TCPDEBUG
	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
	TCP_PROBE3(debug__output, tp, th, m);
	if (flags & TH_RST)
		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);

	/* Hand the finished packet to the IP output routine for its family. */
#ifdef INET6
	if (isipv6) {
		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
		(void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
	}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
		(void)ip_output(m, NULL, NULL, 0, NULL, inp);
	}
#endif
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
*/ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef TCP_HHOOK tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #endif #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. 
*/ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. */ tcp_log_tcpcbinit(tp); #endif if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. 
*/ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ CK_LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ int released __unused; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. 
* * If stopping a timer fails, we schedule a discard function in same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tp->t_timers->tt_draincnt = 0; tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* * Call the stop-all function of the methods, * this function should call the tcp_timer_stop() * method with each of the function specific timeouts. * That stop will be called via the tfb_tcp_timer_stop() * which should use the async drain function of the * callout system (see tcp_var.h). */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); #endif CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if (tp->t_timers->tt_draincnt == 0) { /* We own the last reference on tcpcb, let's free it. */ #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } void tcp_timer_discard(void *ptp) { struct inpcb *inp; struct tcpcb *tp; + struct epoch_tracker et; tp = (struct tcpcb *)ptp; CURVNET_SET(tp->t_vnet); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, ("%s: tcpcb has to be stopped here", __func__)); tp->t_timers->tt_draincnt--; if (tp->t_timers->tt_draincnt == 0) { /* We own the last reference on this tcpcb, let's free it. 
*/ #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); return; } } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... 
* XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inpb); if (inpb->inp_flags & INP_TIMEWAIT) { INP_WUNLOCK(inpb); continue; } if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } INP_WUNLOCK(inpb); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. 
*/ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); inp->inp_route.ro_rt = (struct rtentry *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. 
*/ INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); INP_INFO_WLOCK(&V_tcbinfo); for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, 
*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; + struct epoch_tracker et; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. 
*/ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } icmp_tcp_seq = th->th_seq; if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only process the offered MTU if it * is smaller than the current one. 
*/ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq); } out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct in_conninfo inc; + struct epoch_tracker et; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; dst = ip6cp->ip6c_finaldst; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; dst = NULL; } if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. 
*/ else if (cmd == PRC_HOSTDEAD) ip6 = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) return; if (ip6 == NULL) { in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); return; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was * proposed, or the proposed * MTU was too small, set to * the min. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. 
*/ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, inet6ctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq); } out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. 
The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. 
*/ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. 
*/ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop4_extended nh4; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, NHR_REF, 0, &nh4) != 0) return (0); ifp = nh4.nh_ifp; maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. 
*/ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop6_extended nh6; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, 0, &nh6) != 0) return (0); ifp = nh6.nh_ifp; maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. 
*/ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PAD(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; + struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * 
XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } 
/*
 * A subroutine which makes it easy to track TCP state changes with DTrace.
 * This function shouldn't be called for t_state initializations that don't
 * correspond to actual TCP state transitions.
 *
 * tp is the connection being moved and newstate is the value stored into
 * tp->t_state.  The state being left is handed to TCPSTATES_DEC() and the
 * state being entered to TCPSTATES_INC() before t_state is overwritten
 * (presumably per-state connection counters -- confirm against the macro
 * definitions).
 */
void
tcp_state_change(struct tcpcb *tp, int newstate)
{
#if defined(KDTRACE_HOOKS)
	/* Capture the old state now; t_state is overwritten before the probe. */
	int pstate = tp->t_state;
#endif

	TCPSTATES_DEC(tp->t_state);
	TCPSTATES_INC(newstate);
	tp->t_state = newstate;
	/*
	 * NOTE(review): pstate is only declared under KDTRACE_HOOKS, so
	 * TCP_PROBE6 presumably discards its arguments when the hooks are
	 * compiled out -- confirm.
	 */
	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
}

/*
 * Create an external-format (``xtcpcb'') structure using the information in
 * the kernel-format tcpcb structure pointed to by tp.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 *
 * inp is the source inpcb and xt is the caller-supplied structure to fill.
 * For an inpcb flagged INP_TIMEWAIT the whole xtcpcb is zeroed and only
 * t_state (TCPS_TIME_WAIT) is reported; tp is never dereferenced on that
 * path.  In both cases xt_len and the embedded xt_inp are filled in last.
 */
void
tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
{
	struct tcpcb *tp = intotcpcb(inp);
	sbintime_t now;

	if (inp->inp_flags & INP_TIMEWAIT) {
		bzero(xt, sizeof(struct xtcpcb));
		xt->t_state = TCPS_TIME_WAIT;
	} else {
		xt->t_state = tp->t_state;
		xt->t_logstate = tp->t_logstate;
		xt->t_flags = tp->t_flags;
		xt->t_sndzerowin = tp->t_sndzerowin;
		xt->t_sndrexmitpack = tp->t_sndrexmitpack;
		xt->t_rcvoopack = tp->t_rcvoopack;

		/*
		 * Export each timer as the distance from "now" to its
		 * c_time, scaled by SBT_1MS, or 0 when the callout is not
		 * active.
		 */
		now = getsbinuptime();
#define COPYTIMER(ttt) do { \
		if (callout_active(&tp->t_timers->ttt)) \
			xt->ttt = (tp->t_timers->ttt.c_time - now) / \
			    SBT_1MS; \
		else \
			xt->ttt = 0; \
} while (0)
		COPYTIMER(tt_delack);
		COPYTIMER(tt_rexmt);
		COPYTIMER(tt_persist);
		COPYTIMER(tt_keep);
		COPYTIMER(tt_2msl);
#undef COPYTIMER
		/* Ticks since t_rcvtime, scaled by 1000 / hz. */
		xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;

		/* Name of the TCP function block in use by this tcpcb. */
		bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
		    TCP_FUNCTION_NAME_LEN_MAX);
		/* The log id stays all-zero unless TCP_BLACKBOX fills it in. */
		bzero(xt->xt_logid, TCP_LOG_ID_LEN);
#ifdef TCP_BLACKBOX
		(void)tcp_log_get_id(tp, xt->xt_logid);
#endif
	}

	xt->xt_len = sizeof(struct xtcpcb);
	in_pcbtoxinpcb(inp, &xt->xt_inp);
	/* With no socket attached, set the exported protocol explicitly. */
	if (inp->inp_socket == NULL)
		xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
}
Index:
head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c (revision 335923) +++ head/sys/netinet/tcp_timer.c (revision 335924) @@ -1,1094 +1,1064 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 
&tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); int tcp_always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); VNET_DEFINE(int, tcp_pmtud_blackhole_detect); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); #ifdef INET VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); /* * Map the given inp to a CPU id. 
*
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 *
 * Returns the CPU id chosen for inp:
 *  - per_cpu_timers set, RSS kernel: the result of rss_hash2cpuid() on the
 *    inp's flowid/flowtype, or curcpu when that yields NETISR_CPUID_NONE;
 *  - per_cpu_timers set, non-RSS kernel: flowid modulo (mp_maxid + 1), or
 *    curcpu when CPU_ABSENT() reports that CPU as absent;
 *  - per_cpu_timers clear: always 0.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef	RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (! CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/*
	 * Default for RSS and non-RSS - cpuid 0
	 *
	 * This else pairs with whichever "if (per_cpu_timers)" the
	 * preprocessor kept above.
	 */
	else {
		return (0);
	}
}

/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP
 * causes finite state machine actions if timers expire.
 *
 * As written here the routine only walks every vnet under the vnet list
 * read lock and runs tcp_tw_2msl_scan(0) with that vnet made current; the
 * scan's return value is ignored.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Backoff multipliers indexed by tp->t_rxtshift.  tcp_timer_rexmt() scales
 * TCPTV_RTOBASE by tcp_syn_backoff[] in SYN_SENT/SYN_RECEIVED and
 * TCP_REXMTVAL(tp) by tcp_backoff[] in every other state.
 */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
*/

/*
 * Callout handler for the delayed-ACK timer (tcp_timer_activate() installs
 * it for TT_DELACK).  xtp is the struct tcpcb the callout was armed with.
 *
 * The handler takes the inpcb write lock and gives up, without sending
 * anything, when the callout is pending again, is no longer active, or the
 * inpcb is flagged INP_DROPPED.  Otherwise it sets TF_ACKNOW, bumps the
 * tcps_delack statistic and calls the stack's tfb_tcp_output() routine.
 * Every exit path drops the inpcb lock and restores the vnet context.
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * Pending or inactive presumably means the timer was rescheduled or
	 * stopped after this invocation was dispatched -- confirm against
	 * callout(9).  Either way there is nothing to do here.
	 */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	/* Nothing is sent for an inpcb that has already been dropped. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

-/*
- * When a timer wants to remove a TCB it must
- * hold the INP_INFO_RLOCK(). The timer function
- * should only have grabbed the INP_WLOCK() when
- * it entered. To safely switch to holding both the
- * INP_INFO_RLOCK() and the INP_WLOCK() we must first
- * grab a reference on the inp, which will hold the inp
- * so that it can't be removed. We then unlock the INP_WLOCK(),
- * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
- * we proceed again to get the INP_WLOCK() (this preserves proper
- * lock order). After acquiring the INP_WLOCK we must check if someone
- * else deleted the pcb i.e. the inp_flags check.
- * If so we return 1 otherwise we return 0.
- *
- * No matter what the tcp_inpinfo_lock_add() function
- * returns the caller must afterwards call tcp_inpinfo_lock_del()
- * to drop the locks and reference properly.
- */ - -int -tcp_inpinfo_lock_add(struct inpcb *inp) -{ - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - return(1); - } - return(0); - -} - void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) { - INP_INFO_RUNLOCK(&V_tcbinfo); - if (inp && (tp == NULL)) { - /* - * If tcp_close/drop() gets called and tp - * returns NULL, then the function dropped - * the inp lock, we hold a reference keeping - * this around, so we must re-aquire the - * INP_WLOCK() in order to proceed with - * our dropping the inp reference. - */ - INP_WLOCK(inp); - } - if (inp && in_pcbrele_wlocked(inp) == 0) + if (inp && tp != NULL) INP_WUNLOCK(inp); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. 
*/ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_close(tp); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_close(tp); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } - } + } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Because we don't regularly 
reset the keepalive callout in * the ESTABLISHED state, it may be that we don't actually need * to send a keepalive yet. If that occurs, schedule another * call for the next time the keepalive timer might expire. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { u_int idletime; idletime = ticks - tp->t_rcvtime; if (idletime < TP_KEEPIDLE(tp)) { callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } } /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. 
*/ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp); } else callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); - - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); -out: + out: CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Persistence timer into zero window. * Force a byte to be output, if possible. 
*/ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; struct inpcb *inp; + struct epoch_tracker et; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { 
INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); if (tp->t_fb->tfb_tcp_rexmit_tmr) { /* The stack has a timer action too. */ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. 
*/ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; if ((tp->t_flags & TF_RCVD_TSTMP) == 0) tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); /* In the event that we've negotiated timestamps * badrxtwin will be set to the value that we set * the retransmitted packet's to_tsval to by tcp_output */ tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. 
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. 
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, notify the L3 protocol that we're having * connection problems. */ if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. 
*/ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tp->t_fb->tfb_tcp_output(tp); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); out: CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { callout_stop(t_callout); } else { callout_reset_on(t_callout, delta, f_callout, tp, cpu); } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_active) { return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } /* * 
Stop the timer from running, and apply a flag * against the timer_flags that will force the * timer never to run. The flag is needed to assure * a race does not leave it running and cause * the timer to possibly restart itself (keep and persist * especially do this). */ int tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; uint32_t t_flags; switch (timer_type) { case TT_DELACK: t_flags = TT_DELACK_SUS; t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_flags = TT_REXMT_SUS; t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_flags = TT_PERSIST_SUS; t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_flags = TT_KEEP_SUS; t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_flags = TT_2MSL_SUS; t_callout = &tp->t_timers->tt_2msl; break; default: panic("tp:%p bad timer_type 0x%x", tp, timer_type); } tp->t_timers->tt_flags |= t_flags; return (callout_stop(t_callout)); } void tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type) { switch (timer_type) { case TT_DELACK: if (tp->t_timers->tt_flags & TT_DELACK_SUS) { tp->t_timers->tt_flags &= ~TT_DELACK_SUS; if (tp->t_flags & TF_DELACK) { /* Delayed ack timer should be up activate a timer */ tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } } break; case TT_REXMT: if (tp->t_timers->tt_flags & TT_REXMT_SUS) { tp->t_timers->tt_flags &= ~TT_REXMT_SUS; if (SEQ_GT(tp->snd_max, tp->snd_una) && (tcp_timer_active((tp), TT_PERSIST) == 0) && tp->snd_wnd) { /* We have outstanding data activate a timer */ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } } break; case TT_PERSIST: if (tp->t_timers->tt_flags & TT_PERSIST_SUS) { tp->t_timers->tt_flags &= ~TT_PERSIST_SUS; if (tp->snd_wnd == 0) { /* Activate the persists timer */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } break; case TT_KEEP: if (tp->t_timers->tt_flags & TT_KEEP_SUS) { tp->t_timers->tt_flags &= ~TT_KEEP_SUS; tcp_timer_activate(tp, TT_KEEP, 
TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); } break; case TT_2MSL: if (tp->t_timers->tt_flags &= TT_2MSL_SUS) { tp->t_timers->tt_flags &= ~TT_2MSL_SUS; if ((tp->t_state == TCPS_FIN_WAIT_2) && ((tp->t_inpcb->inp_socket == NULL) || (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) { /* Star the 2MSL timer */ tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : TP_MAXIDLE(tp)); } } break; default: panic("tp:%p bad timer_type 0x%x", tp, timer_type); } } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_stop) { /* * XXXrrs we need to look at this with the * stop case below (flags). */ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { /* * Can't stop the callout, defer tcpcb actual deletion * to the last one. We do this using the async drain * function and incrementing the count in */ tp->t_timers->tt_draincnt++; } } Index: head/sys/netinet/tcp_timer.h =================================================================== --- head/sys/netinet/tcp_timer.h (revision 335923) +++ head/sys/netinet/tcp_timer.h (revision 335924) @@ -1,232 +1,231 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet * received. If an ACK is received which advances tp->snd_una, * then the retransmit timer is cleared (if there are no more * outstanding segments) or reset to the base value (if there * are more ACKs expected). Whenever the retransmit timer goes off, * we retransmit one unacknowledged segment, and do a backoff * on the retransmit timer. 
* * The TCPT_PERSIST timer is used to keep window size information * flowing even if the window goes shut. If all previous transmissions * have been acknowledged (so that there are no retransmissions in progress), * and the window is too small to bother sending anything, then we start * the TCPT_PERSIST timer. When it expires, if the window is nonzero, * we go to transmit state. Otherwise, at intervals send a single byte * into the peer's window to force him to update our window information. * We do this at most as often as TCPT_PERSMIN time intervals, * but no more frequently than the current estimate of round-trip * packet time. The TCPT_PERSIST timer is cleared whenever we receive * a window update from the peer. * * The TCPT_KEEP timer is used to keep connections alive. If an * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, * but not yet established, then we drop the connection. Once the connection * is established, if the connection is idle for TCPTV_KEEP_IDLE time * (and keepalives have been enabled on the socket), we begin to probe * the connection. We force the peer to send us a segment by sending: * * This segment is (deliberately) outside the window, and should elicit * an ack segment in response from the peer. If, despite the TCPT_KEEP * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE * amount of time probing, then we drop the connection. */ /* * Time constants. */ #define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) 
*/ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ #define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ #define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ #define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with * the expected worst-case processing variances by the kernels * representing the end points. Such variances do not always show * up in the srtt because the timestamp is often calculated at * the interface rather then at the TCP layer. This value is * typically 50ms. However, it is also possible that delayed * acks (typically 100ms) could create issues so we set the slop * to 200ms to try to cover it. Note that, properly speaking, * delayed-acks should not create a major issue for interactive * environments which 'P'ush the last segment, at least as * long as implementations do the required 'at least one ack * for every two packets' for the non-interactive streaming case. * (maybe the RTO calculation should use 2*RTT instead of RTT * to handle the ack-every-other-packet case). * * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. 
*/ #define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ #define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ #define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ #define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */ /* * If we exceed this number of retransmits for a single segment, we'll consider * the current srtt measurement no longer valid and will recalculate from * scratch starting with the next ACK. */ #define TCP_RTT_INVALIDATE (TCP_MAXRXTSHIFT / 4) #ifdef TCPTIMERS static const char *tcptimers[] = { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" }; #endif /* * Force a time value to be in a certain range. */ #define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ (tv) = (value) + tcp_rexmit_slop; \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) #ifdef _KERNEL struct xtcp_timer; struct tcp_timer { struct callout tt_rexmt; /* retransmit timer */ struct callout tt_persist; /* retransmit persistence */ struct callout tt_keep; /* keepalive */ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout tt_delack; /* delayed ACK timer */ uint32_t tt_flags; /* Timers flags */ uint32_t tt_draincnt; /* Count being drained */ }; /* * Flags for the tt_flags field. */ #define TT_DELACK 0x0001 #define TT_REXMT 0x0002 #define TT_PERSIST 0x0004 #define TT_KEEP 0x0008 #define TT_2MSL 0x0010 #define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) /* * Suspend flags - used when suspending a timer * from ever running again. */ #define TT_DELACK_SUS 0x0100 #define TT_REXMT_SUS 0x0200 #define TT_PERSIST_SUS 0x0400 #define TT_KEEP_SUS 0x0800 #define TT_2MSL_SUS 0x1000 #define TT_STOPPED 0x00010000 #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? 
(tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; extern int tcp_syn_backoff[]; extern int tcp_totbackoff; extern int tcp_rexmit_drop_options; extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; VNET_DECLARE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) VNET_DECLARE(int, tcp_pmtud_blackhole_mss); #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) -int tcp_inpinfo_lock_add(struct inpcb *inp); void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp); void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); void tcp_timer_discard(void *); struct tcptw * tcp_tw_2msl_scan(int reuse); /* XXX temporary? 
*/ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); void tcp_timer_delack(void *xtp); #endif /* _KERNEL */ #endif /* !_NETINET_TCP_TIMER_H_ */ Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 335923) +++ head/sys/netinet/tcp_timewait.c (revision 335924) @@ -1,753 +1,755 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #ifndef INVARIANTS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #include #include static VNET_DEFINE(uma_zone_t, tcptw_zone); #define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * timewait lock, which must be held over queue iteration and modification. 
* * Rules on tcptw usage: * - a inpcb is always freed _after_ its tcptw * - a tcptw relies on its inpcb reference counting for memory stability * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) /* Global timewait lock */ static VNET_DEFINE(struct rwlock, tw_lock); #define V_tw_lock VNET(tw_lock) #define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) #define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) #define TW_RLOCK(tw) rw_rlock(&(tw)) #define TW_WLOCK(tw) rw_wlock(&(tw)) #define TW_RUNLOCK(tw) rw_runlock(&(tw)) #define TW_WUNLOCK(tw) rw_wunlock(&(tw)) #define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) #define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) #define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) #define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *, int); static int tcp_twrespond(struct tcptw *, int); static int tcptw_auto_size(void) { int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (V_ipport_lastauto > V_ipport_firstauto) halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. 
*/ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(V_tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); static VNET_DEFINE(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE void tcp_tw_destroy(void) { struct tcptw *tw; + struct epoch_tracker et; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. 
*/ void tcp_twstart(struct tcpcb *tp) { struct tcptw twlocal, *tw; struct inpcb *inp = tp->t_inpcb; struct socket *so; bool acknow, local; #ifdef INET6 bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* A dropped inp should never transition to TIME_WAIT state. */ KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: " "(inp->inp_flags & INP_DROPPED) != 0")); if (V_nolocaltimewait) { #ifdef INET6 if (isipv6) local = in6_localaddr(&inp->in6p_faddr); else #endif #ifdef INET local = in_localip(inp->inp_faddr); #else local = false; #endif } else local = false; /* * For use only by DTrace. We do not reference the state * after this point so modifying it in place is not a problem. */ tcp_state_change(tp, TCPS_TIME_WAIT); if (local) tw = &twlocal; else tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { /* * Reached limit on total number of TIMEWAIT connections * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. * * XXX: Check if it possible to always have enough room * in advance based on guarantees provided by uma_zalloc(). */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } /* * For !local case the tcptw will hold a reference on its inpcb * until tcp_twclose is called. */ tw->tw_inpcb = inp; /* * Recover last window size sent. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; else tw->last_win = 0; /* * Set t_recent if timestamps are used on the connection. 
*/ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_so_options = so->so_options; inp->inp_flags |= INP_TIMEWAIT; if (acknow) tcp_twrespond(tw, TH_ACK); if (local) in_pcbdrop(inp); else { in_pcbref(inp); /* Reference from tw */ tw->tw_cred = crhold(so->so_cred); inp->inp_ppcb = tw; TCPSTATES_INC(TCPS_TIME_WAIT); tcp_tw_2msl_reset(tw, 0); } /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. */ if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_WUNLOCK(inp); } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. 
This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); drop: INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. 
*/ inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_WUNLOCK(inp); } } else { /* * The socket has been already cleaned-up for us, only free the * inpcb. 
*/ in_pcbfree(inp); } TCPSTAT_INC(tcps_closed); } static int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; #if defined(INET6) || defined(INET) struct tcphdr *th = NULL; #endif struct mbuf *m; #ifdef INET struct ip *ip = NULL; #endif u_int hdrlen, optlen; int error = 0; /* Keep compiler happy */ struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif hdrlen = 0; /* Keep compiler happy */ INP_WLOCK_ASSERT(inp); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } #endif to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. 
*/ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = tcp_ts_getticks() + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), NULL, inp); } #endif if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); TW_WUNLOCK(V_tw_lock); } static void tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { struct ucred *cred; struct inpcb *inp; int released __unused; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TW_WLOCK(V_tw_lock); inp = tw->tw_inpcb; tw->tw_inpcb = NULL; TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); cred = tw->tw_cred; tw->tw_cred = NULL; TW_WUNLOCK(V_tw_lock); if (cred != NULL) crfree(cred); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp should not be released here", __func__)); if (!reuse) uma_zfree(V_tcptw_zone, tw); TCPSTATES_DEC(TCPS_TIME_WAIT); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; struct inpcb *inp; + struct epoch_tracker et; #ifdef INVARIANTS if (reuse) { /* * Exclusive pcbinfo lock is not required in reuse case even if * two inpcb locks can be acquired simultaneously: * - the inpcb transitioning to TIME_WAIT state in * tcp_tw_start(), * - the inpcb closed by tcp_twclose(). * * It is because only inpcbs in FIN_WAIT2 or CLOSING states can * transition in TIME_WAIT state. Then a pcbcb cannot be in * TIME_WAIT list and transitioning to TIME_WAIT state at same * time. 
*/ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #endif for (;;) { TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { TW_RUNLOCK(V_tw_lock); break; } KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", __func__)); inp = tw->tw_inpcb; in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); tw = intotw(inp); if (in_pcbrele_wlocked(inp)) { if (__predict_true(tw == NULL)) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); continue; } else { /* This should not happen as in TIMEWAIT * state the inp should not be destroyed * before its tcptw. If INVARIANTS is * defined panic. */ #ifdef INVARIANTS panic("%s: Panic before an infinite " "loop: INP_TIMEWAIT && (INP_FREED " "|| inp last reference) && tw != " "NULL", __func__); #else log(LOG_ERR, "%s: Avoid an infinite " "loop: INP_TIMEWAIT && (INP_FREED " "|| inp last reference) && tw != " "NULL", __func__); #endif - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); break; } } if (tw == NULL) { /* tcp_twclose() has already been called */ INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); continue; } tcp_twclose(tw, reuse); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (reuse) return tw; } return NULL; } Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c (revision 335923) +++ head/sys/netinet/tcp_usrreq.c (revision 335924) @@ -1,2498 +1,2506 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include /* * TCP protocol interface to socket abstraction. */ static int tcp_attach(struct socket *); #ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); #ifdef TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() #define TCPDEBUG2(req) #endif /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. 
*/ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); TCPDEBUG1(); error = tcp_attach(so); if (error) goto out; /* With SO_LINGER set and a zero linger time, fall back to TCP_LINGERTIME. */ if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; inp = sotoinpcb(so); tp = intotcpcb(inp); out: /* tp is still NULL here when tcp_attach() failed. */ TCPDEBUG2(PRU_ATTACH); TCP_PROBE2(debug__user, tp, PRU_ATTACH); return error; } /* * tcp_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. * * This function can probably be re-absorbed back into tcp_usr_detach() now * that there is a single detach path. * * The tcbinfo lock and the inpcb write lock are asserted on entry. Every * returning path below ends with either in_pcbfree(inp) or INP_WUNLOCK(inp). */ static void tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); tp = intotcpcb(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * There are two cases to handle: one in which the time wait * state is being discarded (INP_DROPPED), and one in which * this connection will remain in timewait. In the former, * it is time to discard all state (except tcptw, which has * already been discarded by the timewait close code, which * should be further up the call stack somewhere). In the * latter case, we detach from the socket, but leave the pcb * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here?
* * Astute question indeed, from twtcp perspective there are * four cases to consider: * * #1 tcp_detach is called at tcptw creation time by * tcp_twstart, then do not discard the newly created tcptw * and leave inpcb present until timewait ends * #2 tcp_detach is called at tcptw creation time by * tcp_twstart, but connection is local and tw will be * discarded immediately * #3 tcp_detach is called at timewait end (or reuse) by * tcp_twclose, then the tcptw has already been discarded * (or reused) and inpcb is freed here * #4 tcp_detach() is called after timewait ends (or reuse) * (e.g. by soclose), then tcptw has already been discarded * (or reused) and inpcb is freed here * * In all four cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { in_pcbdetach(inp); if (__predict_true(tp == NULL)) { in_pcbfree(inp); } else { /* * This case should not happen as in TIMEWAIT * state the inp should not be destroyed before * its tcptw. If INVARIANTS is defined, panic. */ #ifdef INVARIANTS panic("%s: Panic before an inp double-free: " "INP_TIMEWAIT && INP_DROPPED && tp != NULL" , __func__); #else log(LOG_ERR, "%s: Avoid an inp double-free: " "INP_TIMEWAIT && INP_DROPPED && tp != NULL" , __func__); #endif INP_WUNLOCK(inp); } } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } else { /* * If the connection is not in timewait, we consider two * conditions: one in which no further processing is * necessary (dropped || embryonic), and one in which TCP is * not yet done, but no longer requires the socket, so the * pcb will persist for the time being. * * XXXRW: Does the second case still occur? */ if (inp->inp_flags & INP_DROPPED || tp->t_state < TCPS_SYN_SENT) { tcp_discardcb(tp); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } } /* * pru_detach() detaches the TCP protocol from the socket.
* If the protocol state is non-embryonic, then can't * do this directly: have to initiate a pru_disconnect(), * which may finish later; embryonic TCB's can just * be discarded here. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; int rlock = 0; + struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); /* Take the tcbinfo read lock only when V_tcbinfo is not already write-locked; rlock records that it must be released after tcp_detach(). */ if (!INP_INFO_WLOCKED(&V_tcbinfo)) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); rlock = 1; } INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); /* The inpcb is not unlocked here; tcp_detach() is called with it write-locked. */ tcp_detach(so, inp); if (rlock) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #ifdef INET /* * Give the socket an address. * * Returns EINVAL for a wrong-sized sockaddr or an inpcb that is in * timewait or dropped, EAFNOSUPPORT for an AF_INET multicast address, * and otherwise the result of in_pcbbind(). */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* The bind itself runs under the pcbinfo hash write lock. */ INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 /* * IPv6 variant of tcp_usr_bind(). Same early error returns, but the * multicast test uses IN6_IS_ADDR_MULTICAST() and the bind goes through * in6_pcbbind(), or in_pcbbind() for a v4-mapped address when INET is * compiled in and IN6P_IPV6_V6ONLY is clear. */ static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them.
*/ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); /* Start as IPv6 only; the INET block below may add INP_IPV4 or switch to it. */ inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { /* v4-mapped address: convert to a sockaddr_in, mark the inpcb IPv4 only and bind through in_pcbbind(). */ struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Prepare to accept connections.
*/ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); /* Bind to a port chosen by in_pcbbind() if no local port is set yet. */ if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); /* NOTE(review): this runs even when error != 0 above -- confirm that allocating the TFO counter on a failed listen is intended. */ if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 /* * IPv6 variant of tcp_usr_listen(). When no local port is bound, INP_IPV4 * is cleared and then set again only if IN6P_IPV6_V6ONLY is clear, before * in6_pcbbind() is called. */ static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); /* NOTE(review): as in tcp_usr_listen(), this is not conditional on error == 0 -- confirm. */ if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp,
PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. * * Returns EINVAL for a wrong-sized sockaddr, EAFNOSUPPORT for an AF_INET * multicast destination, the prison_remote_ip4() error if any, EADDRINUSE * for an inpcb in timewait and ECONNREFUSED for a dropped one. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) return (error); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; /* When tcp_offload_connect() returns 0 the keep timer and the output call below are skipped. */ #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 /* * IPv6 variant of tcp_usr_connect(). With INET compiled in, a v4-mapped * destination is converted to a sockaddr_in and connected through * tcp_connect(); otherwise the connection is set up by tcp6_connect(). */ static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; TCPDEBUG0; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses.
*/ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; /* A v4-mapped destination is rejected on a V6ONLY socket (EINVAL) or when INP_IPV4 is not set (EAFNOSUPPORT). */ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; /* NOTE(review): the offload call below is given nam (the sockaddr_in6), not the converted sin -- confirm. */ #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif /* NOTE(review): unlike the other two connect paths, TT_KEEP is not armed before output here -- confirm. */ error = tp->t_fb->tfb_tcp_output(tp); goto out; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer.
* If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; int error = 0; TCPDEBUG0; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); /* An inpcb already in timewait is reported as success: error stays 0. */ if (inp->inp_flags & INP_TIMEWAIT) goto out; if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } #ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through nam. * * Returns ECONNABORTED when the socket is marked SS_ISDISCONNECTED or the * inpcb is in timewait or dropped; *nam is only written on success. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct in_addr addr; in_port_t port = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock.
*/ port = inp->inp_fport; addr = inp->inp_faddr; out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); /* The sockaddr is built from the copied port and address after the inpcb lock is released. */ if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #endif /* INET */ #ifdef INET6 /* * IPv6 variant of tcp_usr_accept(). On success *nam is a v4-mapped IPv6 * sockaddr when the inpcb has INP_IPV4 set, and a plain IPv6 sockaddr * otherwise. */ static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; + struct epoch_tracker et; in_port_t port = 0; int v4 = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); /* Only one of addr and addr6 was filled in above; v4 selects which one is used. */ if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output.
*/ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); /* INP_DROPPED is tested again after tcp_usrclosed(); the output routine is only called when it is still clear. */ if (!(inp->inp_flags & INP_DROPPED)) error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } /* * After a receive, possibly send window update to peer. * * Returns ECONNRESET when the inpcb is in timewait or dropped and 0 * otherwise. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early * SYN|ACK. It is preferable to have the SYN|ACK be sent along with * application response data, or failing that, when the DELACK timer * expires. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; /* The result of the offload or output call below is not stored; error stays 0 on this path. */ #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); TCP_PROBE2(debug__user, tp, PRU_RCVD); INP_WUNLOCK(inp); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees.
*/ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker net_et; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* * We require the pcbinfo lock if we will close the socket as part of * this call. */ if (flags & PRUS_EOF) - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, net_et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); /* * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible * for freeing memory. */ if (m && (flags & PRUS_NOTREADY) == 0) m_freem(m); error = ECONNRESET; goto out; } #ifdef INET6 isipv6 = nam && nam->sa_family == AF_INET6; #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); /* NOTE(review): unlike the timewait/dropped path above, m is freed here without testing PRUS_NOTREADY -- confirm ownership. */ if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { /* From here on m belongs to the send buffer; the error paths below do not free it. */ sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, nam, td); #endif if (error) goto out; if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_connect(tp); else { tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent.
*/ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } if (!(inp->inp_flags & INP_DROPPED) && !(flags & PRUS_NOTREADY)) { /* TF_MORETOCOME is set only for the duration of this output call. */ if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); /* NOTE(review): m is freed on ENOBUFS without testing PRUS_NOTREADY -- confirm ownership. */ if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ /* * Not going to contemplate SYN|URG */ if (IS_FASTOPEN(tp->t_flags)) tp->t_flags &= ~TF_FASTOPEN; #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, nam, td); #endif if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + sbavail(&so->so_snd); /* TF_FORCEDATA is set only around this output call. */ if (!(flags & PRUS_NOTREADY)) { tp->t_flags |= TF_FORCEDATA; error = tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } } TCP_LOG_EVENT(tp, NULL, &inp->inp_socket->so_rcv, &inp->inp_socket->so_snd, TCP_LOG_USERSEND, error, 0, NULL, false); out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ?
PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); /* Release the tcbinfo lock under the same PRUS_EOF condition it was taken with. */ if (flags & PRUS_EOF) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, net_et); return (error); } /* * Hand m and count to sbready() under the send buffer lock and, when that * succeeds, call the output routine of the stack. If the inpcb is in * timewait or dropped, count mbufs starting at m are freed and ECONNRESET * is returned. */ static int tcp_usr_ready(struct socket *so, struct mbuf *m, int count) { struct inpcb *inp; struct tcpcb *tp; int error; inp = sotoinpcb(so); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); for (int i = 0; i < count; i++) m = m_free(m); return (ECONNRESET); } tp = intotcpcb(inp); SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error == 0) error = tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); return (error); } /* * Abort the TCP. Drop the connection abruptly. */ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tp = tcp_drop(tp, ECONNABORTED); /* When tcp_drop() returns NULL the INP_WUNLOCK() below is skipped. */ if (tp == NULL) goto dropped; TCPDEBUG2(PRU_ABORT); TCP_PROBE2(debug__user, tp, PRU_ABORT); } /* If the inpcb is still not dropped, flag the socket with SS_PROTOREF and the inpcb with INP_SOCKREF. */ if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); dropped: - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } /* * TCP socket is closed. Start friendly disconnect.
*/ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); TCP_PROBE2(debug__user, tp, PRU_CLOSE); } /* INP_DROPPED is tested again here; if it is clear, flag the socket with SS_PROTOREF and the inpcb with INP_SOCKREF. */ if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } /* * Receive out-of-band data. * * On success m holds the single byte saved in tp->t_iobc. Returns * ECONNRESET for an inpcb in timewait or dropped, EINVAL when there is no * mark, SO_OOBINLINE is set or TCPOOB_HADDATA is set, and EWOULDBLOCK when * TCPOOB_HAVEDATA is clear. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; /* Unless peeking, toggle both flags: HAVEDATA is set at this point and HADDATA is clear. */ if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCPDEBUG2(PRU_RCVOOB); TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET /* User-request table built when INET is defined; it wires the handlers above plus the in_* helpers. */ struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp_usr_bind, .pru_connect = tcp_usr_connect, .pru_control = in_control, .pru_detach = tcp_usr_detach,
.pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, .pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 /* User-request table built when INET6 is defined; accept, bind, connect and listen use the tcp6_* variants, the other handlers are shared with tcp_usrreqs. */ struct pr_usrreqs tcp6_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp6_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp6_usr_bind, .pru_connect = tcp6_usr_connect, .pru_control = in6_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state.
*/ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct in_addr laddr; u_short lport; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) goto out; /* NOTE(review): any existing connection reported through oinp is answered with EADDRINUSE here; no TIME-WAIT truncation is visible in this function. */ if (oinp) { error = EADDRINUSE; goto out; } inp->inp_laddr = laddr; in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); return 0; out: /* Error exit: the hash lock is still held on every path that jumps here. */ INP_HASH_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ #ifdef INET6 /* * IPv6 variant of tcp_connect(): bind a local port with in6_pcbbind() if * none is set, call in6_pcbconnect(), then compute the window scale and * enter SYN_SENT. The inpcb write lock is asserted on entry. */ static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } error = in6_pcbconnect(inp, nam, td->td_ucred); if (error != 0) goto out; INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request.
*/ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return error; } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. * * The inpcb write lock is asserted on entry and *ti is zeroed before any * field is filled in. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { INP_WLOCK_ASSERT(tp->t_inpcb); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; /* An option bit is reported only when it was both requested and received (timestamps, window scale) or is marked as permitted (SACK, ECN). */ if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } if (tp->t_flags & TF_ECN_PERMIT) ti->tcpi_options |= TCPI_OPT_ECN; /* The time-valued fields below are tick counts multiplied by tick. */ ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat.
*/ ti->tcpi_snd_nxt = tp->snd_nxt; /* Both MSS fields are filled from t_maxseg. */ ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; tcp_offload_tcp_info(tp, ti); } #endif } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. * * Both macros assign the caller-local variable tp, and they return * ECONNRESET from the enclosing function (after unlocking and running * cleanup) when the inpcb is in timewait or dropped. */ #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) /* * Socket option entry point. Options at a level other than IPPROTO_TCP are * handed to ip6_ctloutput() or ip_ctloutput(). Setting and getting * TCP_FUNCTION_BLK is handled here. Everything else is passed to the * tfb_tcp_ctloutput method of the current stack with the inpcb still locked. */ int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error; struct inpcb *inp; struct tcpcb *tp; struct tcp_function_block *blk; struct tcp_function_set fsn; error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Protect the TCP option TCP_FUNCTION_BLK so * that a sub-function can *never* overwrite this.
*/ if ((sopt->sopt_dir == SOPT_SET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { /* The inpcb lock is dropped around the copyin and retaken (with revalidation) by INP_WLOCK_RECHECK(). */ INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK_RECHECK(inp); blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { INP_WUNLOCK(inp); return (ENOENT); } if (tp->t_fb == blk) { /* You already have this */ refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (0); } if (tp->t_state != TCPS_CLOSED) { /* NOTE: this block-local error shadows the function-level variable of the same name. */ int error=EINVAL; /* * The user has advanced the state * past the initial point, we may not * be able to switch. */ if (blk->tfb_tcp_handoff_ok != NULL) { /* * Does the stack provide a * query mechanism, if so it may * still be possible? */ error = (*blk->tfb_tcp_handoff_ok)(tp); } if (error) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return(error); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } /* * Release the old refcnt, the * lookup acquired a ref on the * new one already. */ if (tp->t_fb->tfb_tcp_fb_fini) { /* * Tell the stack to cleanup with 0 i.e. * the tcb is not going away.
*/ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); } #ifdef TCPHPTS /* Assure that we are not on any hpts */ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL); #endif if (blk->tfb_tcp_fb_init) { error = (*blk->tfb_tcp_fb_init)(tp); if (error) { /* The new stack failed to initialise: drop its reference and, since the old stack was already finalised above, try to initialise the old one again. */ refcount_release(&blk->tfb_refcnt); if (tp->t_fb->tfb_tcp_fb_init) { if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { /* Fall back failed, drop the connection */ INP_WUNLOCK(inp); soabort(so); return(error); } } goto err_out; } } /* Success: drop the reference on the old stack and install the new one. */ refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif err_out: INP_WUNLOCK(inp); return (error); } else if ((sopt->sopt_dir == SOPT_GET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { /* strncpy() may leave the name unterminated, so the last byte is forced to NUL on the next statement. */ strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); return (error); } /* Pass in the INP locked, callee must unlock it */ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); } /* * If this assert becomes untrue, we need to change the size of the buf * variable in tcp_default_ctloutput(). */ #ifdef CTASSERT CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN); CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int error, opt, optval; u_int ui; struct tcp_info ti; struct cc_algo *algo; char *pbuf, buf[TCP_LOG_ID_LEN]; size_t len; /* * For TCP_CCALGOOPT forward the control to CC module, for both * SOPT_SET and SOPT_GET.
*/ switch (sopt->sopt_name) { case TCP_CCALGOOPT: INP_WUNLOCK(inp); pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, sopt->sopt_valsize); if (error) { free(pbuf, M_TEMP); return (error); } INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); if (CC_ALGO(tp)->ctl_output != NULL) error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); else error = ENOENT; INP_WUNLOCK(inp); if (error == 0 && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); free(pbuf, M_TEMP); return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); if (error) return (error); goto unlock_and_done; #endif /* IPSEC */ case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; unlock_and_done: #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); break; case TCP_NOPUSH: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) error = tp->t_fb->tfb_tcp_output(tp); } goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= 
V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_CONGESTION: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) break; CC_LIST_RUNLOCK(); if (algo == NULL) { INP_WUNLOCK(inp); error = EINVAL; break; } /* * We hold a write lock over the tcb so it's safe to * do these things without ordering concerns. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); CC_ALGO(tp) = algo; /* * If something goes pear shaped initialising the new * algo, fall back to newreno (which does not * require initialisation). */ if (algo->cb_init != NULL && algo->cb_init(tp->ccv) != 0) { CC_ALGO(tp) = &newreno_cc_algo; /* * The only reason init should fail is * because of malloc. */ error = ENOMEM; } INP_WUNLOCK(inp); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); if (ui > (UINT_MAX / hz)) { error = EINVAL; break; } ui *= hz; INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_KEEPIDLE: tp->t_keepidle = ui; /* * XXX: better check current remaining * timeout and "merge" it with new value. 
*/ if ((tp->t_state > TCPS_LISTEN) && (tp->t_state <= TCPS_CLOSING)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); break; case TCP_KEEPINTVL: tp->t_keepintvl = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); break; case TCP_KEEPINIT: tp->t_keepinit = ui; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); break; } goto unlock_and_done; case TCP_KEEPCNT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); tp->t_keepcnt = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); goto unlock_and_done; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval >= 0) tcp_pcap_set_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else error = EINVAL; goto unlock_and_done; #endif case TCP_FASTOPEN: { struct tcp_fastopen tfo_optval; INP_WUNLOCK(inp); if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) return (EPERM); error = sooptcopyin(sopt, &tfo_optval, sizeof(tfo_optval), sizeof(int)); if (error) return (error); INP_WLOCK_RECHECK(inp); if (tfo_optval.enable) { if (tp->t_state == TCPS_LISTEN) { if (!V_tcp_fastopen_server_enable) { error = EPERM; goto unlock_and_done; } tp->t_flags |= TF_FASTOPEN; if (tp->t_tfo_pending == NULL) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); } else { /* * If a pre-shared key was provided, * stash it in the client cookie * field of the tcpcb for use during * connect. 
*/ if (sopt->sopt_valsize == sizeof(tfo_optval)) { memcpy(tp->t_tfo_cookie.client, tfo_optval.psk, TCP_FASTOPEN_PSK_LEN); tp->t_tfo_client_cookie_len = TCP_FASTOPEN_PSK_LEN; } tp->t_flags |= TF_FASTOPEN; } } else tp->t_flags &= ~TF_FASTOPEN; goto unlock_and_done; } #ifdef TCP_BLACKBOX case TCP_LOG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); error = tcp_log_state_change(tp, optval); goto unlock_and_done; case TCP_LOGBUF: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_LOGID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); error = tcp_log_set_id(tp, buf); /* tcp_log_set_id() unlocks the INP. */ break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); if (sopt->sopt_name == TCP_LOGDUMP) { error = tcp_log_dump_tp_logbuf(tp, buf, M_WAITOK, true); INP_WUNLOCK(inp); } else { tcp_log_dump_tp_bucket_logbufs(tp, buf); /* * tcp_log_dump_tp_bucket_logbufs() drops the * INP lock. 
*/ } break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; case TCP_KEEPINTVL: ui = TP_KEEPINTVL(tp) / hz; break; case TCP_KEEPINIT: ui = TP_KEEPINIT(tp) / hz; break; case TCP_KEEPCNT: ui = TP_KEEPCNT(tp); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? 
&(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #ifdef TCP_BLACKBOX case TCP_LOG: optval = tp->t_logstate; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_LOGBUF: /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */ error = tcp_log_getlogbuf(sopt, tp); break; case TCP_LOGID: len = tcp_log_get_id(tp, buf); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = EINVAL; break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #undef INP_WLOCK_RECHECK #undef INP_WLOCK_RECHECK_CLEANUP /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, * bufer space, and entering LISTEN state if to accept connections. 
*/ static int tcp_attach(struct socket *so) { struct tcpcb *tp; struct inpcb *inp; + struct epoch_tracker et; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) return (error); } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } inp = sotoinpcb(so); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ } else #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); TCPSTATES_INC(TCPS_CLOSED); return (0); } /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. 
*/ if (tp->t_state < TCPS_ESTABLISHED) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) tp->t_fb->tfb_tcp_output(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { case TCPS_LISTEN: #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? 
tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? 
", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LQ_OVERFLOW) { db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_CONGRECOVERY) { db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } if (t_flags & TF_ECN_PERMIT) { db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? 
", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); db_print_indent(indent); db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %u snd_cwnd: %u\n", tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); db_printf("snd_ssthresh: %u snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_rcvtime: %u t_startime: %u\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, 
tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); db_printf("t_rttupdated: %lu max_sndwnd: %u t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %u\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 335923) +++ head/sys/netinet/tcp_var.h (revision 335924) @@ -1,966 +1,964 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { LIST_ENTRY(tseg_qent) tqe_q; int tqe_len; /* TCP segment data length */ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. 
of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ int sacked_bytes; /* * Total sacked bytes reported by the * receiver via sack option */ uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ }; STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. */ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ bits_spare : 4; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq 
rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. 
of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ uint32_t t_logsn; /* Log "serial number" */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. 
If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). 
*/ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, - int, int, uint8_t, - int); + int, int, uint8_t); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, - int, int, struct timeval *); + int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. 
*/ #define TF_ACKNOW 0x000001 /* ack peer immediately */ #define TF_DELACK 0x000002 /* ack, but try to delay it */ #define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x000008 /* don't use tcp options */ #define TF_SENTFIN 0x000010 /* have sent FIN */ #define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ #define TF_TSO 0x1000000 /* TSO enabled on this connection */ #define TF_TOE 0x2000000 /* this connection is offloaded */ #define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ #define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define 
EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. 
*/ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. 
*/
struct tcp_ifcap {
	int	ifcap;
	u_int	tsomax;
	u_int	tsomaxsegcount;
	u_int	tsomaxsegsize;
};

/*
 * Forward declaration for translation units in which the in_pcb.h include
 * guard is not defined.
 */
#ifndef _NETINET_IN_PCB_H_
struct in_conninfo;
#endif /* _NETINET_IN_PCB_H_ */

struct tcptw {
	struct inpcb	*tw_inpcb;	/* XXX back pointer to internet pcb */
	tcp_seq		snd_nxt;
	tcp_seq		rcv_nxt;
	tcp_seq		iss;
	tcp_seq		irs;
	u_short		last_win;	/* cached window value */
	short		tw_so_options;	/* copy of so_options */
	struct ucred	*tw_cred;	/* user credentials */
	u_int32_t	t_recent;
	u_int32_t	ts_offset;	/* our timestamp offset */
	u_int		t_starttime;
	int		tw_time;
	TAILQ_ENTRY(tcptw) tw_2msl;
	void		*tw_pspare;	/* TCP_SIGNATURE */
	u_int		*tw_spare;	/* TCP_SIGNATURE */
};

/*
 * Accessors that reinterpret an inpcb's inp_ppcb pointer as a tcpcb or a
 * tcptw; sototcpcb() first maps the socket to its inpcb via sotoinpcb().
 * None of them checks which of the two types inp_ppcb actually holds.
 */
#define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
#define	intotw(ip)	((struct tcptw *)(ip)->inp_ppcb)
#define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))

/*
 * The smoothed round-trip time and estimated variance
 * are stored as fixed point numbers scaled by the values below.
 * For convenience, these scales are also used in smoothing the average
 * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
 * With these scales, srtt has 3 bits to the right of the binary point,
 * and thus an "ALPHA" of 0.875.  rttvar has 2 bits to the right of the
 * binary point, and is smoothed with an ALPHA of 0.75.
 *
 * NOTE(review): the bit counts quoted in the paragraph above do not match
 * the constants defined below (32 == 1 << 5, 16 == 1 << 4).  Whether the
 * ALPHA figures still hold depends on how tcp_input.c combines these
 * shifts with TCP_DELTA_SHIFT -- confirm there before relying on them.
 */
#define	TCP_RTT_SCALE		32	/* multiplier for srtt; 5 bits frac. */
#define	TCP_RTT_SHIFT		5	/* shift for srtt; 5 bits frac. */
#define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 4 bits */
#define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 4 bits */
#define	TCP_DELTA_SHIFT		2	/* see tcp_input.c */

/*
 * The initial retransmission should happen at rtt + 4 * rttvar.
 * Because of the way we do the smoothing, srtt and rttvar
 * will each average +1/2 tick of bias.  When we compute
 * the retransmit timer, we want 1/2 tick of rounding and
 * 1 extra tick because of +-1/2 tick uncertainty in the
 * firing of the timer.  The bias will give us exactly the
 * 1.5 tick we need.
But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats 
*/ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. 
*/ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { size_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ int64_t spare64[8]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ int32_t spare32[32]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). 
*/ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif extern int tcp_log_in_vain; /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. 
*/ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_inc); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_rfc6675_pipe); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define 
V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void 
tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); + struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb 
*, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; tcp_seq tcp_new_isn(struct tcpcb *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb); static inline void tcp_fields_to_host(struct tcphdr *th) 
{ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 335923) +++ head/sys/netinet/udp_usrreq.c (revision 335924) @@ -1,1799 +1,1802 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). 
*/
/* Per-VNET knob, default on; exported as the "checksum" sysctl below. */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/* Global (not per-VNET) knob, default off. */
int	udp_log_in_vain = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &udp_log_in_vain, 0, "Log all incoming UDP packets");

/* Per-VNET knob, default off. */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Default receive space: 40 times (1024 bytes plus one socket address).
 * The address size is that of sockaddr_in6 when INET6 is configured and
 * of sockaddr_in otherwise.
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );	/* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
/* Zone from which udp_newudpcb() allocates; created in udp_init(). */
static VNET_DEFINE(uma_zone_t, udpcb_zone);
#define	V_udpcb_zone			VNET(udpcb_zone)

/* Hash size handed to in_pcbinfo_init(); may be overridden at build time. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
#ifdef INET
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *);
#endif

/*
 * maxsockets_change event handler (registered in udp_init()): re-applies
 * the current maxsockets value as the limit of both the UDP inpcb zone
 * and the udpcb zone.  The tag argument is not used.
 */
static void
udp_zone_change(void *tag)
{

	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
	uma_zone_set_max(V_udpcb_zone, maxsockets);
}

/*
 * Init callback handed to in_pcbinfo_init() for the UDP pcbinfo:
 * initializes the lock of the inpcb stored at mem.  The size and flags
 * arguments are not used.  Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp;

	inp = mem;
	INP_LOCK_INIT(inp, "inp", "udpinp");
	return (0);
}

/*
 * UDP-Lite counterpart of udp_inpcb_init(); differs only in the lock
 * name ("udpliteinp").  Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp;

	inp = mem;
	INP_LOCK_INIT(inp, "inp", "udpliteinp");
	return (0);
}

/*
 * Protocol initialization for UDP: sets up the UDP pcbinfo, creates the
 * udpcb zone, caps it at maxsockets and registers udp_zone_change() so
 * that the cap follows later changes of maxsockets.
 */
void
udp_init(void)
{

	/*
	 * For now default to 2-tuple UDP hashing - until the fragment
	 * reassembly code can also update the flowid.
	 *
	 * Once we can calculate the flowid that way and re-establish
	 * a 4-tuple, flip this to 4-tuple.
	 */
	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
	    "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
	/* No ctor/dtor/init/fini callbacks: udpcbs are zeroed at allocation. */
	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_udpcb_zone, maxsockets);
	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
}

/*
 * Protocol initialization for UDP-Lite: sets up its own pcbinfo.  No
 * separate zone is created here.
 */
void
udplite_init(void)
{

	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
	    IPI_HASHFIELDS_2TUPLE);
}

/*
 * Kernel module interface for updating udpstat.  The argument is an index
 * into udpstat treated as an array of u_long.  While this encodes the
 * general layout of udpstat into the caller, it doesn't encode its location,
 * so that future changes to add, for example, per-CPU stats support won't
 * cause binary compatibility problems for kernel modules.
*/ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE static void udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); static void udplite_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ulitecbinfo); } VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy, NULL); #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However if a tunneling protocol is in place we increment * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we * then decrement the reference count. If the inp_rele returns 1, indicating the * inp is gone, we return that to the caller to tell them *not* to unlock * the inp. In the case of multi-cast this will cause the distribution * to stop (though most tunneling protocols known currently do *not* use * multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *tmpopts, *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. 
*/ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0], up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) { m_freem(n); return (0); } if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */ if (IPSEC_ENABLED(ipv4) && UDPENCAP_INPUT(n, off, AF_INET) != 0) return (0); /* Consumed. */ } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmpopts = sbcreatecontrol((caddr_t)&udp_in[1], sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP); if (tmpopts) { if (opts) { tmpopts->m_next = opts; opts = tmpopts; } else opts = tmpopts; } } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)&udp_in[0]; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in 
udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; + struct epoch_tracker et; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2); udp_in[0].sin_len = sizeof(struct sockaddr_in); udp_in[0].sin_family = AF_INET; udp_in[0].sin_port = uh->uh_sport; udp_in[0].sin_addr = ip->ip_src; udp_in[1].sin_len = sizeof(struct sockaddr_in); udp_in[1].sin_family = AF_INET; udp_in[1].sin_port = uh->uh_dport; udp_in[1].sin_addr = ip->ip_dst; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. 
*/ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? 
*/ m_freem(m); return (IPPROTO_DONE); } } pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_RLOCK_ET(pcbinfo, et); pcblist = udp_get_pcblist(proto); last = NULL; CK_LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. 
[RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, n, iphlen, udp_in)) { goto inp_lost; } } INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); goto badunlocked; } UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(last); inp_lost: - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport), inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. 
*/ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); inp->inp_route.ro_rt = (struct rtentry *)NULL; } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify); return; } /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. 
*/ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { struct udpcb *up; void *ctx; udp_tun_icmp_t func; up = intoudpcb(inp); ctx = up->u_tun_ctx; func = up->u_icmp_func; INP_RUNLOCK(inp); if (func != NULL) (*func)(cmd, sa, vip, ctx); } } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK_ET(&V_udbinfo, et); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK_ET(&V_udbinfo, et); for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. 
*/ - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK_ET(&V_udbinfo, et); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 
1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET case UDP_ENCAP: if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(inp, sopt); break; #endif /* INET */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET case UDP_ENCAP: if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(inp, sopt); break; #endif /* INET */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: 
INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; + struct epoch_tracker et; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo, unlock_inp; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; sin = (struct sockaddr_in *)addr; if (sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_WLOCK(inp); unlock_inp = UH_WLOCKED; } else { INP_RLOCK(inp); unlock_inp = UH_RLOCKED; } tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. 
*/ if (control->m_next) { if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. 
This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - INP_HASH_RLOCK(pcbinfo); + INP_HASH_RLOCK_ET(pcbinfo, et); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. 
*/ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. 
*/ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. 
*/ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. 
* * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK(pcbinfo); + INP_HASH_RUNLOCK_ET(pcbinfo, et); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, shared inp lock", __func__)); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { KASSERT(unlock_inp == UH_RLOCKED, ("%s: shared udbinfo lock, excl inp lock", __func__)); - INP_HASH_RUNLOCK(pcbinfo); + INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); } else if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); m_freem(m); return (error); } static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { static uint32_t udp_flowid; struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp 
!= NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1); inp->inp_flowtype = M_HASHTYPE_OPAQUE; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if ((up->u_tun_func != NULL) || (up->u_icmp_func != NULL)) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_icmp_func = i; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, 
struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, 
control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 335923) +++ head/sys/netinet6/icmp6.c (revision 335924) @@ -1,2818 +1,2819 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); static VNET_DEFINE(int, icmp6errpps_count) = 0; static VNET_DEFINE(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) #define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct 
in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: 
ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. 
*/ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. 
*/ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); ICMP6STAT_INC(icp6s_outhist[type]); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int ip6len, error; ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. 
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_ADMIN_PROHIB; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. 
*/ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). 
*/ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(m, off, icmp6len) != 0) return (IPPROTO_DONE); /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { struct prison *pr; u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? 
*/ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { IP6_EXTHDR_CHECK(m, off, icmp6len, IPPROTO_DONE); error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, 
ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? 
ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? 
*/ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. 
*/ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. 
*/ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? 
*/ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; struct prison *pr; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. 
* Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { ifa_free(&ia6->ia_ifa); nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } ifa_free(&ia6->ia_ifa); } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. 
*/ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. 
*/ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? 
But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! 
*/ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. 
takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. 
*/ return (0); } } IFNET_RLOCK_NOSLEEP(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. 
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } IF_ADDR_RUNLOCK(ifp); if (iffound) { *ifpp = ifp; IFNET_RUNLOCK_NOSLEEP(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK_NOSLEEP(); return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ IFNET_RLOCK_NOSLEEP(); ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. 
not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { IF_ADDR_RUNLOCK(ifp); /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } IF_ADDR_RUNLOCK(ifp); if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * XXX almost dup'ed code with rip6_input. 
*/ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *in6p; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; + struct epoch_tracker et; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); continue; } if (last != NULL) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. 
To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked( &last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); INP_RUNLOCK(last); } else { m_freem(m); IP6STAT_DEC(ip6s_delivered); } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. 
* OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. 
*/ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } if (ia != NULL) ifa_free(&ia->ia_ifa); } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. 
*/ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { mld_fasttimo(); } void icmp6_slowtimo(void) { mld_slowtimo(); } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); ifp = m->m_pkthdr.rcvif; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif redtgt6 = 
nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); if (fib6_lookup_nh_basic(ifp->if_fib, &kdst, scopeid, 0, 0,&nh6)==0){ if ((nh6.nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* * Embed scope zone id into next hop address, since * fib6_lookup_nh_basic() returns address without embedded * scope zone id. 
*/ if (in6_setscope(&nh6.nh_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh6.nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? 
ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. */ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = RTF_HOST; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) in6_rtredirect((struct sockaddr *)&sdst, gw, (struct sockaddr *)NULL, rt_flags, (struct sockaddr *)&ssrc, fibnum); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; 
/* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, rt->rt_fibnum); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(router_ll6, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); 
m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = 
m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. 
*/ /* Get or set the ICMP6_FILTER option of an ICMPv6 socket. A NULL sopt zeroes level/op/optname/optlen; any level other than IPPROTO_ICMPV6 returns EINVAL. Unknown option names return ENOPROTOOPT. An op that is neither PRCO_SETOPT nor PRCO_GETOPT falls out of the switch and returns 0. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; /* the caller must supply exactly one struct icmp6_filter */ if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); /* install the copied-in filter under the inpcb write lock */ if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; /* snapshot the filter under the read lock, copy it out after unlocking */ INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ /* a zero return from ppsratecheck() is what marks the packet as over the limit here */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_gif.c =================================================================== --- head/sys/netinet6/in6_gif.c (revision 335923) +++ head/sys/netinet6/in6_gif.c (revision 335924) @@ -1,427 +1,428 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #include #include #include #include #define GIF_HLIM 30 static VNET_DEFINE(int, ip6_gif_hlim) = GIF_HLIM; #define V_ip6_gif_hlim VNET(ip6_gif_hlim) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gif_hlim), 0, "Default hop limit for encapsulated packets"); /* * We keep interfaces in a hash table using src+dst as key. * Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list. */ static VNET_DEFINE(struct gif_list *, ipv6_hashtbl) = NULL; static VNET_DEFINE(struct gif_list, ipv6_list) = CK_LIST_HEAD_INITIALIZER(); #define V_ipv6_hashtbl VNET(ipv6_hashtbl) #define V_ipv6_list VNET(ipv6_list) #define GIF_HASH(src, dst) (V_ipv6_hashtbl[\ in6_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)]) #define GIF_HASH_SC(sc) GIF_HASH(&(sc)->gif_ip6hdr->ip6_src,\ &(sc)->gif_ip6hdr->ip6_dst) /* Hash an address pair: fnv_32_buf() over src seeded with FNV1_32_INIT, then continued over dst. The order of the two arguments therefore matters. */ static uint32_t in6_gif_hashval(const struct in6_addr *src, const struct in6_addr *dst) { uint32_t ret; ret = fnv_32_buf(src, sizeof(*src), FNV1_32_INIT); return (fnv_32_buf(dst, sizeof(*dst), ret)); } /* Check whether the pair (src, dst) is already in use. Returns EEXIST when sc itself is an AF_INET6 tunnel with exactly these addresses, EADDRNOTAVAIL when another softc in the same hash bucket has them, and 0 otherwise. Only the hash table is searched, not the GIF_IGNORE_SOURCE list. */ static int in6_gif_checkdup(const struct gif_softc *sc, const struct in6_addr *src, const struct in6_addr *dst) { struct gif_softc *tmp; if (sc->gif_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, src) && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, dst)) return (EEXIST); CK_LIST_FOREACH(tmp, &GIF_HASH(src, dst), chain) { if (tmp == sc) continue; if (IN6_ARE_ADDR_EQUAL(&tmp->gif_ip6hdr->ip6_src, src) && IN6_ARE_ADDR_EQUAL(&tmp->gif_ip6hdr->ip6_dst, dst)) return (EADDRNOTAVAIL); } return (0); } static void
in6_gif_attach(struct gif_softc *sc) { /* Link sc into the plain GIF_IGNORE_SOURCE list when that option is set, otherwise into the hash bucket selected by its own src/dst. */ if (sc->gif_options & GIF_IGNORE_SOURCE) CK_LIST_INSERT_HEAD(&V_ipv6_list, sc, chain); else CK_LIST_INSERT_HEAD(&GIF_HASH_SC(sc), sc, chain); } /* Apply new option flags to an AF_INET6 tunnel. Only a change of the GIF_IGNORE_SOURCE bit has an effect in this function: sc is unlinked, gif_options is replaced and sc is attached to the list matching the new flag. Always returns 0. */ int in6_gif_setopts(struct gif_softc *sc, u_int options) { /* NOTE: we are protected with gif_ioctl_sx lock */ MPASS(sc->gif_family == AF_INET6); MPASS(sc->gif_options != options); if ((options & GIF_IGNORE_SOURCE) != (sc->gif_options & GIF_IGNORE_SOURCE)) { CK_LIST_REMOVE(sc, chain); sc->gif_options = options; in6_gif_attach(sc); } return (0); } /* Handle the IPv6 tunnel-endpoint ioctls. SIOCSIFPHYADDR_IN6 sets the outer source and destination; SIOCGIFPSRCADDR_IN6 and SIOCGIFPDSTADDR_IN6 read one of them back. Any other cmd leaves error at its initial EINVAL. */ int in6_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct sockaddr_in6 *dst, *src; struct ip6_hdr *ip6; int error; /* NOTE: we are protected with gif_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR_IN6: src = &((struct in6_aliasreq *)data)->ifra_addr; dst = &((struct in6_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin6_family != dst->sin6_family || src->sin6_family != AF_INET6 || src->sin6_len != dst->sin6_len || src->sin6_len != sizeof(*src)) break; if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) { error = EADDRNOTAVAIL; break; } /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ if ((error = sa6_embedscope(src, 0)) != 0 || (error = sa6_embedscope(dst, 0)) != 0) break; /* the hash table is created lazily, on the first address assignment */ if (V_ipv6_hashtbl == NULL) V_ipv6_hashtbl = gif_hashinit(); error = in6_gif_checkdup(sc, &src->sin6_addr, &dst->sin6_addr); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. */ error = 0; break; } /* new zeroed outer-header template; only src and dst are filled in here */ ip6 = malloc(sizeof(*ip6), M_GIF, M_WAITOK | M_ZERO); ip6->ip6_src = src->sin6_addr; ip6->ip6_dst = dst->sin6_addr; if (sc->gif_family != 0) { /* Detach existing tunnel first */ /* unlink, then GIF_WAIT() (presumably waits out concurrent readers; confirm in if_gif.h) before the old header is freed */ CK_LIST_REMOVE(sc, chain); GIF_WAIT(); free(sc->gif_hdr, M_GIF); /* XXX: should we notify about link state change?
*/ } /* switch the softc to the new header and link it into the lookup structures */ sc->gif_family = AF_INET6; sc->gif_ip6hdr = ip6; in6_gif_attach(sc); break; case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gif_family != AF_INET6) { error = EADDRNOTAVAIL; break; } /* the reply is built in place in ifr_addr and zeroed again if either check below fails */ src = (struct sockaddr_in6 *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin6_family = AF_INET6; src->sin6_len = sizeof(*src); src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ? sc->gif_ip6hdr->ip6_src: sc->gif_ip6hdr->ip6_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error == 0) error = sa6_recoverscope(src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } /* Prepend the outer IPv6 header stored in sc->gif_ip6hdr to m and send it with ip6_output(). ecn is OR-ed into the flow word shifted left by 20 bits, proto becomes ip6_nxt and the hop limit comes from V_ip6_gif_hlim. Returns ENOBUFS when M_PREPEND leaves m NULL, otherwise the ip6_output() result. */ int in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; int len; /* prepend new IP header */ - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); len = sizeof(struct ip6_hdr); #ifndef __NO_STRICT_ALIGNMENT /* for EtherIP, ETHERIP_ALIGN extra bytes are requested here and given back below */ if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { /* m_data advances by the observed misalignment (0 or ETHERIP_ALIGN) while m_len always shrinks by ETHERIP_ALIGN */ len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in6_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip6 = mtod(m, struct ip6_hdr *); MPASS(sc->gif_family == AF_INET6); bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr)); ip6->ip6_flow |= htonl((uint32_t)ecn << 20); ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets.
*/ return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL)); } /* Input handler registered in ipv6_encap_cfg; arg is the softc stored by in6_gif_lookup(). When the interface is up, bits 20-27 of the outer flow word are extracted as ecn, off bytes are trimmed from the front of m and the rest goes to gif_input(). When sc is NULL or the interface is down, m is freed and ip6s_nogif is counted. Always returns IPPROTO_DONE. */ static int in6_gif_input(struct mbuf *m, int off, int proto, void *arg) { struct gif_softc *sc = arg; struct ifnet *gifp; struct ip6_hdr *ip6; uint8_t ecn; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); if (sc == NULL) { m_freem(m); IP6STAT_INC(ip6s_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip6 = mtod(m, struct ip6_hdr *); ecn = (ntohl(ip6->ip6_flow) >> 20) & 0xff; m_adj(m, off); gif_input(m, gifp, proto, ecn); } else { m_freem(m); IP6STAT_INC(ip6s_nogif); } return (IPPROTO_DONE); } /* Lookup handler registered in ipv6_encap_cfg: find the gif softc for an inbound packet. An exact src+dst match in the hash table is tried first, then a destination-only match in the GIF_IGNORE_SOURCE list. Returns 0 when nothing matches (or no hash table exists yet); on success stores the softc in *arg and returns the match score. */ static int in6_gif_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip6_hdr *ip6; struct gif_softc *sc; int ret; if (V_ipv6_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); /* * NOTE: it is safe to iterate without any locking here, because softc * can be reclaimed only when we are not within net_epoch_preempt * section, but ip_encap lookup+input are executed in epoch section. */ ip6 = mtod(m, const struct ip6_hdr *); ret = 0; /* bucket is picked with the packet's dst first, matching the (src, dst) order used when the softc was inserted */ CK_LIST_FOREACH(sc, &GIF_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) { /* * This is an inbound packet, its ip6_dst is source address * in softc. */ if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst) && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, &ip6->ip6_src)) { ret = ENCAP_DRV_LOOKUP; goto done; } } /* * No exact match. * Check the list of interfaces with GIF_IGNORE_SOURCE flag.
*/ CK_LIST_FOREACH(sc, &V_ipv6_list, chain) { if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst)) { /* NOTE(review): 136 looks like a match score counted in bits (one 128-bit address plus 8 protocol bits); confirm how ip_encap ranks it against ENCAP_DRV_LOOKUP */ ret = 128 + 8; /* src + proto */ goto done; } } return (0); done: /* a matching tunnel whose interface is down is reported as no match */ if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); /* ingress filters on outer source */ /* skipped when IFF_LINK2 is set; otherwise the route back to the outer source must exist and use the interface the packet arrived on */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { struct nhop6_basic nh6; if (fib6_lookup_nh_basic(sc->gif_fibnum, &ip6->ip6_src, ntohs(in6_getscope(&ip6->ip6_src)), 0, 0, &nh6) != 0) return (0); if (nh6.nh_ifp != m->m_pkthdr.rcvif) return (0); } *arg = sc; return (ret); } /* One encap registration per inner protocol (IPv4 only when INET is defined, IPv6, EtherIP). All share the same lookup and input handlers; cookie is filled in by in6_gif_init() and consumed by in6_gif_uninit(). */ static struct { const struct encap_config encap; const struct encaptab *cookie; } ipv6_encap_cfg[] = { #ifdef INET { .encap = { .proto = IPPROTO_IPV4, .min_length = sizeof(struct ip6_hdr) + sizeof(struct ip), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, }, #endif { .encap = { .proto = IPPROTO_IPV6, .min_length = 2 * sizeof(struct ip6_hdr), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, }, { .encap = { .proto = IPPROTO_ETHERIP, .min_length = sizeof(struct ip6_hdr) + sizeof(struct etherip_header) + sizeof(struct ether_header), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, } }; /* Attach every entry of ipv6_encap_cfg; does nothing outside the default vnet. */ void in6_gif_init(void) { int i; if (!IS_DEFAULT_VNET(curvnet)) return; for (i = 0; i < nitems(ipv6_encap_cfg); i++) ipv6_encap_cfg[i].cookie = ip6_encap_attach( &ipv6_encap_cfg[i].encap, NULL, M_WAITOK); } /* Detach the encap entries (default vnet only) and destroy the current vnet's hash table if one was ever created. */ void in6_gif_uninit(void) { int i; if (IS_DEFAULT_VNET(curvnet)) { for (i = 0; i < nitems(ipv6_encap_cfg); i++) ip6_encap_detach(ipv6_encap_cfg[i].cookie); } if (V_ipv6_hashtbl != NULL) gif_hashdestroy(V_ipv6_hashtbl); } Index: head/sys/netinet6/ip6_gre.c =================================================================== --- head/sys/netinet6/ip6_gre.c (revision 335923) +++ head/sys/netinet6/ip6_gre.c (revision 335924) @@ -1,287 +1,288 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #include #include VNET_DEFINE(int, ip6_gre_hlim) = IPV6_DEFHLIM; #define V_ip6_gre_hlim VNET(ip6_gre_hlim) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, OID_AUTO, grehlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gre_hlim), 0, "Default hop limit for encapsulated packets"); /* Per-vnet hash table of GRE softcs keyed by outer src+dst; NULL until the first SIOCSIFPHYADDR_IN6. */ static VNET_DEFINE(struct gre_list *, ipv6_hashtbl) = NULL; #define V_ipv6_hashtbl VNET(ipv6_hashtbl) #define GRE_HASH(src, dst) (V_ipv6_hashtbl[\ in6_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH(&(sc)->gre_oip6.ip6_src,\ &(sc)->gre_oip6.ip6_dst) /* Hash an address pair: fnv_32_buf() over src seeded with FNV1_32_INIT, then continued over dst. The order of the two arguments therefore matters. */ static uint32_t in6_gre_hashval(const struct in6_addr *src, const struct in6_addr *dst) { uint32_t ret; ret = fnv_32_buf(src, sizeof(*src), FNV1_32_INIT); return (fnv_32_buf(dst, sizeof(*dst), ret)); } /* Check whether the pair (src, dst) is already in use. Returns EEXIST when sc itself is an AF_INET6 tunnel with exactly these addresses, EADDRNOTAVAIL when another softc in the same hash bucket has them, and 0 otherwise. */ static int in6_gre_checkdup(const struct gre_softc *sc, const struct in6_addr *src, const struct in6_addr *dst) { struct gre_softc *tmp; if (sc->gre_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, src) && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, dst)) return (EEXIST); CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { if (tmp == sc) continue; if (IN6_ARE_ADDR_EQUAL(&tmp->gre_oip6.ip6_src, src) && IN6_ARE_ADDR_EQUAL(&tmp->gre_oip6.ip6_dst, dst)) return (EADDRNOTAVAIL); } return (0); } /* Lookup handler registered in ipv6_encap_cfg: find the GRE softc whose outer addresses mirror those of the inbound packet. Returns 0 when no hash table exists, nothing matches, or the matching interface is down; otherwise stores the softc in *arg and returns ENCAP_DRV_LOOKUP. */ static int in6_gre_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip6_hdr *ip6; struct gre_softc *sc; if (V_ipv6_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip6 = mtod(m, const struct ip6_hdr *); CK_LIST_FOREACH(sc, &GRE_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) { /* * This is an inbound packet, its ip6_dst is source address * in softc.
*/ if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, &ip6->ip6_dst) && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &ip6->ip6_src)) { if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* Finish the outer header (length, IPv6 version, next header = GRE), let gre_updatehdr() fill in the GRE part, then link sc into the hash bucket for its src/dst. */ static void in6_gre_attach(struct gre_softc *sc) { sc->gre_hlen = sizeof(struct greip6); sc->gre_oip6.ip6_vfc = IPV6_VERSION; sc->gre_oip6.ip6_nxt = IPPROTO_GRE; gre_updatehdr(sc, &sc->gre_gi6hdr->gi6_gre); CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); } /* Change the GRE key (GRESKEY) or option flags (GRESOPTS) of an AF_INET6 tunnel. The softc is unlinked and GRE_WAIT() is called before the field is changed; in6_gre_attach() then rebuilds the header and relinks it. */ void in6_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { MPASS(cmd == GRESKEY || cmd == GRESOPTS); /* NOTE: we are protected with gre_ioctl_sx lock */ MPASS(sc->gre_family == AF_INET6); CK_LIST_REMOVE(sc, chain); GRE_WAIT(); if (cmd == GRESKEY) sc->gre_key = value; else sc->gre_options = value; in6_gre_attach(sc); } /* Handle the IPv6 tunnel-endpoint ioctls. SIOCSIFPHYADDR_IN6 sets the outer source and destination; SIOCGIFPSRCADDR_IN6 and SIOCGIFPDSTADDR_IN6 read one of them back. Any other cmd leaves error at its initial EINVAL. */ int in6_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct sockaddr_in6 *dst, *src; struct ip6_hdr *ip6; int error; /* NOTE: we are protected with gre_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR_IN6: src = &((struct in6_aliasreq *)data)->ifra_addr; dst = &((struct in6_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin6_family != dst->sin6_family || src->sin6_family != AF_INET6 || src->sin6_len != dst->sin6_len || src->sin6_len != sizeof(*src)) break; if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) { error = EADDRNOTAVAIL; break; } /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ if ((error = sa6_embedscope(src, 0)) != 0 || (error = sa6_embedscope(dst, 0)) != 0) break; /* the hash table is created lazily, on the first address assignment */ if (V_ipv6_hashtbl == NULL) V_ipv6_hashtbl = gre_hashinit(); error = in6_gre_checkdup(sc, &src->sin6_addr, &dst->sin6_addr); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return.
*/ error = 0; break; } /* zeroed header buffer: struct greip6 plus room for three extra 32-bit words (NOTE(review): presumably the optional GRE fields; confirm against gre_updatehdr()) */ ip6 = malloc(sizeof(struct greip6) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip6->ip6_src = src->sin6_addr; ip6->ip6_dst = dst->sin6_addr; if (sc->gre_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, chain); GRE_WAIT(); free(sc->gre_hdr, M_GRE); /* XXX: should we notify about link state change? */ } /* switch to the new header, reset both sequence counters and relink */ sc->gre_family = AF_INET6; sc->gre_hdr = ip6; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; in6_gre_attach(sc); break; case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gre_family != AF_INET6) { error = EADDRNOTAVAIL; break; } /* the reply is built in place in ifr_addr and zeroed again if either check below fails */ src = (struct sockaddr_in6 *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin6_family = AF_INET6; src->sin6_len = sizeof(*src); src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ? sc->gre_oip6.ip6_src: sc->gre_oip6.ip6_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error == 0) error = sa6_recoverscope(src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } /* Send an already encapsulated packet: m must start with a struct greip6. Only the hop limit is set here (from V_ip6_gre_hlim); af and hlen are unused. Returns the ip6_output() result; IPV6_MINMTU is passed as the flags argument. */ int in6_gre_output(struct mbuf *m, int af __unused, int hlen __unused) { struct greip6 *gi6; gi6 = mtod(m, struct greip6 *); gi6->gi6_ip6.ip6_hlim = V_ip6_gre_hlim; return (ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL, NULL)); } /* Cookie returned by ip6_encap_attach(); set by in6_gre_init() and passed to ip6_encap_detach() in in6_gre_uninit(). */ static const struct encaptab *ecookie = NULL; static const struct encap_config ipv6_encap_cfg = { .proto = IPPROTO_GRE, .min_length = sizeof(struct greip6) + #ifdef INET sizeof(struct ip), #else sizeof(struct ip6_hdr), #endif .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gre_lookup, .input = gre_input }; /* Attach the GRE encap entry; does nothing outside the default vnet. */ void in6_gre_init(void) { if (!IS_DEFAULT_VNET(curvnet)) return; ecookie = ip6_encap_attach(&ipv6_encap_cfg, NULL, M_WAITOK); } /* Detach the encap entry (default vnet only) and destroy the current vnet's hash table if one was ever created. */ void in6_gre_uninit(void) { if (IS_DEFAULT_VNET(curvnet)) ip6_encap_detach(ecookie); if (V_ipv6_hashtbl != NULL) gre_hashdestroy(V_ipv6_hashtbl); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 335923) +++
head/sys/netinet6/raw_ip6.c (revision 335924) @@ -1,899 +1,900 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. 
*/ /* The raw PCB list and its inpcbinfo are only declared here (VNET_DECLARE), not defined; the same goes for the two extern socket-buffer sizes. */ VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; extern u_long rip_recvspace; /* Per-CPU raw IPv6 statistics; the uninit hook is registered only for VIMAGE kernels. */ VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat); VNET_PCPUSTAT_SYSINIT(rip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rip6stat); #endif /* VIMAGE */ /* * Hooks for multicast routing. They all default to NULL, so leave them * uninitialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip6_mrouter); /* * The various mrouter functions. */ int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain.
*/ /* Deliver one inbound packet to every matching raw IPv6 socket. The PCB list is walked with a one-behind scheme: each match except the final one receives a copy made with m_copym(), and the final match (left in last) receives the original mbuf after the loop. Always returns IPPROTO_DONE. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *in6p; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; + struct epoch_tracker et; RIP6STAT_INC(rip6s_ipackets); init_sin6(&fromsa, m, 0); /* general init */ ifp = m->m_pkthdr.rcvif; - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { /* XXX inp locking */ /* unlocked pre-filters: address family, protocol, bound local address, connected foreign address */ if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p && in6p->inp_ip_p != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (jailed_without_vnet(in6p->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && prison_check_ip6(in6p->inp_cred, &ip6->ip6_dst) != 0) continue; } /* from here on the candidate is read-locked; every continue below must unlock it first */ INP_RLOCK(in6p); /* in6p_cksum == -1 means the socket did not ask for checksum verification */ if (in6p->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { INP_RUNLOCK(in6p); RIP6STAT_INC(rip6s_badsum); continue; } } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (in6p->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it * through unconditionally to the raw socket. * * Use the M_RTALERT_MLD flag to check for MLD * traffic without having to inspect the mbuf chain * more deeply, as all MLDv1/v2 host messages MUST * contain the Router Alert option.
* * In the case of MLDv1, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. im6o_mc_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if ((m->m_flags & M_RTALERT_MLD) == 0) { struct sockaddr_in6 mcaddr; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(in6p->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); INP_RUNLOCK(in6p); continue; } } /* a previous match exists: give it a copy now, because the original mbuf is reserved for the final match */ if (last != NULL) { struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (n != NULL && IPSEC_CHECK_POLICY(ipv6, n, last) != 0) { m_freem(n); /* Do not inject data into pcb. */ n = NULL; } } #endif /* IPSEC */ /* n is NULL when the copy failed or the policy check rejected it; that socket is then silently skipped */ if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } /* in6p stays read-locked while it is remembered as last */ last = in6p; } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6) && last != NULL && IPSEC_CHECK_POLICY(ipv6, m, last) != 0) { m_freem(m); IP6STAT_DEC(ip6s_delivered); /* Do not inject data into pcb.
*/ INP_RUNLOCK(last); } else #endif /* IPSEC */ /* final match: it receives the original mbuf rather than a copy */ if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. */ m_adj(m, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); INP_RUNLOCK(last); } else { /* no socket wanted the packet: free it for IPPROTO_NONE, otherwise answer with a parameter-problem (unrecognized next header) error */ RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); if (proto == IPPROTO_NONE) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); } return (IPPROTO_DONE); } /* Control-input handler for raw IPv6 sockets: forwards command cmd concerning address sa to in6_pcbnotify() on the raw PCB table. Returns silently when sa is not a full-size AF_INET6 sockaddr, when cmd is out of range, or when inet6ctlerrmap[cmd] is 0 for a cmd that is neither a redirect nor PRC_HOSTDEAD. */ void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; /* redirects and PRC_HOSTDEAD discard the ICMPv6 parameter block, so the wildcard source is used below */ if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. */ int rip6_output(struct mbuf *m, struct socket *so, ...)
{ struct mbuf *control; struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int use_defzone = 0; int hlim = 0; struct in6_addr in6a; va_list ap; va_start(ap, so); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoinpcb(so); INP_WLOCK(in6p); if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (!optp || !optp->ip6po_pktinfo || !optp->ip6po_pktinfo->ipi6_ifindex) use_defzone = V_ip6_use_defzone; if (dstsock->sin6_scope_id == 0 && !use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; error = prison_check_ip6(in6p->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. 
*/ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = in6p->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } /* * Send RA/RS messages to user land for protection, before sending * them to rtadvd/rtsol. */ if ((send_sendso_input_hook != NULL) && so->so_proto->pr_protocol == IPPROTO_ICMPV6) { switch (type) { case ND_ROUTER_ADVERT: case ND_ROUTER_SOLICIT: mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m, mtag); } } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); ICMP6STAT_INC(icp6s_outhist[type]); } else RIP6STAT_INC(rip6s_opackets); goto freectl; bad: if (m) m_freem(m); freectl: if (control != NULL) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? 
*/ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { inp = sotoinpcb(so); INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct 
socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0) return (error); if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); NET_EPOCH_ENTER(); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) { NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); } NET_EPOCH_EXIT(); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); 
INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as it should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ error = in6_selectsrc_socket(addr, inp->in6p_outputopts, inp, so->so_cred, scope_ambiguous, &in6a, NULL); if (error) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. 
*/ if (so->so_state & SS_ISCONNECTED) { if (nam) { ret = EISCONN; goto release; } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { ret = ENOTCONN; goto release; } if (nam->sa_len != sizeof(struct sockaddr_in6)) { ret = EINVAL; goto release; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { ret = EAFNOSUPPORT; goto release; } } /* rip6_output() disposes of control itself on its freectl path. */ ret = rip6_output(m, so, dst, control); return (ret); release: /* * Validation failed before rip6_output() was reached. Both mbuf * chains were handed to us, so free control as well as m; freeing * only m would leak any ancillary data supplied by the sender. */ m_freem(m); if (control != NULL) m_freem(control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c (revision 335923) +++ head/sys/netinet6/udp6_usrreq.c (revision 335924) @@ -1,1329 +1,1330 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. 
*/ extern struct protosw inetsw[]; static void udp6_detach(struct socket *so); static int udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts = NULL, *tmp_opts; struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0], up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) { m_freem(n); return (0); } } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmp_opts = sbcreatecontrol((caddr_t)&fromsa[1], sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR, IPPROTO_IPV6); if (tmp_opts) { if (opts) { tmp_opts->m_next = opts; opts = tmp_opts; } else opts = tmp_opts; } } m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *ifp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; + struct epoch_tracker et; struct sockaddr_in6 fromsa[2]; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), 
IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); if (!uh) return (IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); #endif UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = proto; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } /* * Construct sockaddr format source address. */ init_sin6(&fromsa[0], m, 0); fromsa[0].sin6_port = uh->uh_sport; init_sin6(&fromsa[1], m, 1); fromsa[1].sin6_port = uh->uh_dport; pcbinfo = udp_get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip6_moptions *imo; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_RLOCK_ET(pcbinfo, et); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. 
To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ pcblist = udp_get_pcblist(nxt); last = NULL; CK_LIST_FOREACH(inp, pcblist, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_lport != uh->uh_dport) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) || inp->inp_fport != uh->uh_sport) continue; } /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. 
[RFC3678] */ imo = inp->in6p_moptions; if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct sockaddr_in6 mcaddr; int blocked; INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); continue; } bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(imo, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); /* XXX */ continue; } INP_RUNLOCK(inp); } if (last != NULL) { struct mbuf *n; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { INP_RLOCK(last); /* Test the pcb that was just locked (last), not the loop cursor. */ if (__predict_true(last->inp_flags2 & INP_FREED) == 0) { UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, n, off, fromsa)) goto inp_lost; } INP_RUNLOCK(last); } } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datagram.) 
*/ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); goto badheadlocked; } INP_RLOCK(last); /* * Check last, the pcb locked above. inp is only the list cursor and * may be NULL here when the walk ran to the end of the list. */ if (__predict_true(last->inp_flags2 & INP_FREED) == 0) { UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, m, off, fromsa) == 0) INP_RUNLOCK(last); } else INP_RUNLOCK(last); inp_lost: /* * Reached directly when udp6_append() returned non-zero: the pcb lock * must not be released again, but the pcbinfo read section entered * above still has to be exited. */ - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. 
*/ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, fromsa) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badheadlocked: - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); badunlocked: if (m) m_freem(m); return (IPPROTO_DONE); } static void udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. 
*/ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } if (ip6) { /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); if (!PRC_IS_REDIRECT(cmd)) { /* Check to see if its tunneled */ struct inpcb *inp; inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst, uh.uh_dport, &ip6->ip6_src, uh.uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp != NULL) { struct udpcb *up; up = intoudpcb(inp); if (up->u_icmp_func) { /* Yes it is. */ INP_RUNLOCK(inp); (*up->u_icmp_func)(cmd, (struct sockaddr *)ip6cp->ip6c_src, d, up->u_tun_ctx); return; } else { /* Can't find it. */ INP_RUNLOCK(inp); } } } (void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } else (void)in6_pcbnotify(pcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo)); } void udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } 
inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct sockaddr_in6 *sin6 = NULL; int cscov_partial = 0; int scope_ambiguous = 0; u_short fport; int error = 0; uint8_t nxt; uint16_t cscov = 0; struct ip6_pktopts *optp, opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (addr6) { /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return (error); } nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? 
IPPROTO_UDP : IPPROTO_UDPLITE; if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) goto release; optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { faddr = &sin6->sin6_addr; /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { error = in6_selectsrc_socket(sin6, optp, inp, td->td_ucred, scope_ambiguous, &in6a, NULL); if (error) goto release; laddr = &in6a; } else laddr = &inp->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->inp_lport == 0 && (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) { /* Undo an address bind that may have occurred. 
*/ inp->in6p_laddr = in6addr_any; goto release; } } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 
0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } #ifdef RSS { uint32_t hash_val, hash_type; uint8_t pr; pr = inp->inp_socket->so_proto->pr_protocol; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v6(faddr, laddr, fport, inp->inp_lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } } #endif flags = 0; #ifdef RSS /* * Don't override with the inp cached flowid. * * Until the whole UDP path is vetted, it may actually * be incorrect. 
*/ flags |= IP_NODEFAULTFLOWID; #endif UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, &inp->inp_route6, flags, inp->in6p_moptions, NULL, inp); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } /* * udp6_abort: pru_abort handler for UDP and UDP-Lite sockets over IPv6. * Runs with the inpcb write-locked. A pcb flagged INP_IPV4 is handed to the * IPv4 protocol's pru_abort; otherwise, if a foreign address is set, the pcb * is disconnected, its local address is reset to in6addr_any and the socket * is marked disconnected. */ static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); /* the inp lock is released before the IPv4 handler is invoked */ pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_abort)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* only a pcb with a foreign address needs disconnecting; done under the pcb hash write lock */ INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } /* * udp6_attach: pru_attach handler; creates the inpcb and udpcb for a socket * that has none yet. Reserves udp_sendspace/udp_recvspace when either buffer * high-water mark is zero, allocates the pcb under the pcbinfo write lock, * and flags it INP_IPV6 (plus INP_IPV4 unless IN6P_IPV6_V6ONLY is set). * Returns 0, or the error from soreserve(), in_pcballoc() or udp_newudpcb(). */ static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!!
* IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { /* udpcb allocation failed: undo the pcb allocation before returning */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } /* * udp6_bind: pru_bind handler; binds the socket's pcb to the address in nam. * The inp write lock and the pcb hash write lock are held for the whole * operation. The pcb is first marked IPv6-only; unless IN6P_IPV6_V6ONLY is * set, an unspecified address re-enables INP_IPV4 and (with INET) an * IPv4-mapped address is converted to a sockaddr_in and bound through * in_pcbbind() as an IPv4-only pcb. Returns the in_pcbbind() or * in6_pcbbind() result. * NOTE(review): inp_vflag is rewritten before the bind call and is not * restored when the bind fails -- confirm this is intended. */ static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } /* * udp6_close: pru_close handler. With the inpcb write-locked, a pcb flagged * INP_IPV4 is handed to the IPv4 protocol's pru_disconnect; otherwise a pcb * with a foreign address is disconnected, its local address is reset to * in6addr_any and the socket is marked disconnected. */ static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_disconnect)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } /* * udp6_connect: pru_connect handler; connects the socket to the address in * nam, holding the inp write lock throughout. * With INET, an IPv4-mapped destination fails with EINVAL when * IN6P_IPV6_V6ONLY is set, EAFNOSUPPORT when the pcb lacks INP_IPV4, and * EISCONN when an IPv4 foreign address is already set; otherwise the pcb is * switched to IPv4-only and connected through in_pcbconnect(). A * non-mapped destination fails with EAFNOSUPPORT when the pcb lacks * INP_IPV6. * On the IPv6 path the call fails with EISCONN when a foreign address is * already set; otherwise the pcb is switched to IPv6-only and connected * through in6_pcbconnect(). * Both paths run the prison_remote_ip4()/prison_remote_ip6() check first * and call soisconnected() only when the connect succeeds. * NOTE(review): inp_vflag is rewritten before the prison check and the * connect call and is not restored when either fails -- confirm this is * intended. */ static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; /* the pcb hash write lock is held only around the connect call itself */ INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); goto out; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); out: INP_WUNLOCK(inp); return (error); } /* * udp6_detach: pru_detach handler; tears down the socket's protocol state. * Takes the pcbinfo and inp write locks, detaches and frees the inpcb, * drops the pcbinfo lock and then discards the udpcb. */ static void udp6_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo =
udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } /* * udp6_disconnect: pru_disconnect handler. With the inpcb write-locked, a * pcb flagged INP_IPV4 is handed to the IPv4 protocol's pru_disconnect and * 0 is returned. Otherwise returns ENOTCONN when no foreign address is set; * else disconnects the pcb under the pcb hash write lock, resets the local * address to in6addr_any, clears SS_ISCONNECTED on the socket and returns 0. * NOTE(review): the IPv4 handler's return value is cast to void, so its * errors are not reported to the caller -- confirm this is intended. */ static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (void)(*pru->pru_disconnect)(so); return (0); } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } /* * udp6_send: pru_send handler; transmits mbuf chain m, optionally to addr * and with ancillary data in control. * A supplied addr must have sa_len == sizeof(struct sockaddr_in6) (else * EINVAL) and sa_family == AF_INET6 (else EAFNOSUPPORT). With INET and * IN6P_IPV6_V6ONLY clear, the datagram is passed to the IPv4 protocol's * pru_send when addr is IPv4-mapped, or, when addr is NULL, when the pcb is * flagged INP_IPV4. Otherwise udp6_output() is called with the inp and pcb * hash write locks held. * NOTE(review): the bad: exit frees m but not control -- confirm who owns * control on that path. */ static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error = 0; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = NULL; if (addr == NULL) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_WUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[nxt]].pr_usrreqs; /* addr will just be freed in sendit(). */ return ((*pru->pru_send)(so, flags, m, addr, control, td)); } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif INP_HASH_WLOCK(pcbinfo); error = udp6_output(inp, m, addr, control, td); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); bad: INP_WUNLOCK(inp); m_freem(m); return (error); } struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, .pru_bind = udp6_bind, .pru_connect = udp6_connect, .pru_control = in6_control, .pru_detach = udp6_detach, .pru_disconnect = udp6_disconnect, .pru_peeraddr = in6_mapped_peeraddr, .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close }; Index: head/sys/sys/epoch.h =================================================================== --- head/sys/sys/epoch.h (revision 335923) +++ head/sys/sys/epoch.h (revision 335924) @@ -1,93 +1,93 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EPOCH_H_ #define _SYS_EPOCH_H_ #ifdef _KERNEL #include -#include +#include #endif -struct thread; struct epoch; typedef struct epoch *epoch_t; #define EPOCH_PREEMPT 0x1 #define EPOCH_LOCKED 0x2 extern epoch_t global_epoch; extern epoch_t global_epoch_preempt; struct epoch_context { void *data[2]; -} __aligned(sizeof(void *)); +} __aligned(sizeof(void *)); typedef struct epoch_context *epoch_context_t; + +struct epoch_tracker { + void *datap[3]; +#ifdef INVARIANTS + int datai[5]; +#else + int datai[1]; +#endif +} __aligned(sizeof(void *)); + +typedef struct epoch_tracker *epoch_tracker_t; + epoch_t epoch_alloc(int flags); void epoch_free(epoch_t epoch); -void epoch_enter(epoch_t epoch); -void epoch_enter_preempt_internal(epoch_t epoch, struct thread *td); -void epoch_exit(epoch_t epoch); -void epoch_exit_preempt_internal(epoch_t epoch, struct thread *td); void epoch_wait(epoch_t epoch); void epoch_wait_preempt(epoch_t epoch); void epoch_call(epoch_t epoch, epoch_context_t ctx, void 
(*callback) (epoch_context_t)); -int in_epoch(void); - +int in_epoch(epoch_t epoch); +int in_epoch_verbose(epoch_t epoch, int dump_onfail); #ifdef _KERNEL DPCPU_DECLARE(int, epoch_cb_count); DPCPU_DECLARE(struct grouptask, epoch_cb_task); +#define EPOCH_MAGIC0 0xFADECAFEF00DD00D +#define EPOCH_MAGIC1 0xBADDBABEDEEDFEED -static __inline void -epoch_enter_preempt(epoch_t epoch) -{ - struct thread *td; - int nesting __unused; +void epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et); +void epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et); +void epoch_enter_KBI(epoch_t epoch); +void epoch_exit_KBI(epoch_t epoch); - td = curthread; - nesting = td->td_epochnest++; -#ifndef INVARIANTS - if (nesting == 0) -#endif - epoch_enter_preempt_internal(epoch, td); -} -static __inline void -epoch_exit_preempt(epoch_t epoch) -{ - struct thread *td; +#if defined(KLD_MODULE) && !defined(KLD_TIED) +#define epoch_enter_preempt(e, t) epoch_enter_preempt_KBI((e), (t)) +#define epoch_exit_preempt(e, t) epoch_exit_preempt_KBI((e), (t)) +#define epoch_enter(e) epoch_enter_KBI((e)) +#define epoch_exit(e) epoch_exit_KBI((e)) +#else +#include +#endif /* KLD_MODULE */ - td = curthread; - MPASS(td->td_epochnest); - if (td->td_epochnest-- == 1) - epoch_exit_preempt_internal(epoch, td); -} -#endif /* _KERNEL */ +#endif /* _KERNEL */ #endif Index: head/sys/sys/epoch_private.h =================================================================== --- head/sys/sys/epoch_private.h (nonexistent) +++ head/sys/sys/epoch_private.h (revision 335924) @@ -0,0 +1,203 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_EPOCH_PRIVATE_H_ +#define _SYS_EPOCH_PRIVATE_H_ +#ifndef _KERNEL +#error "no user serviceable parts" +#else +#include +#include + +#include + +extern void epoch_adjust_prio(struct thread *td, u_char prio); +#ifndef _SYS_SYSTM_H_ +extern void critical_exit_preempt(void); +#endif + +#ifdef __amd64__ +#define EPOCH_ALIGN CACHE_LINE_SIZE*2 +#else +#define EPOCH_ALIGN CACHE_LINE_SIZE +#endif + +/* + * Standalone (_sa) routines for thread state manipulation + */ +static __inline void +critical_enter_sa(void *tdarg) +{ + struct thread_lite *td; + + td = tdarg; + td->td_critnest++; + __compiler_membar(); +} + +static __inline void +critical_exit_sa(void *tdarg) +{ + struct thread_lite *td; + + td = tdarg; + MPASS(td->td_critnest > 0); + __compiler_membar(); + td->td_critnest--; + __compiler_membar(); + if (__predict_false(td->td_owepreempt != 0)) + critical_exit_preempt(); +} + +typedef struct epoch_thread { +#ifdef INVARIANTS + uint64_t et_magic_pre; 
+#endif + TAILQ_ENTRY(epoch_thread) et_link; /* Epoch queue. */ + struct thread *et_td; /* pointer to thread in section */ + ck_epoch_section_t et_section; /* epoch section object */ +#ifdef INVARIANTS + uint64_t et_magic_post; +#endif +} *epoch_thread_t; +TAILQ_HEAD (epoch_tdlist, epoch_thread); + +typedef struct epoch_record { + ck_epoch_record_t er_record; + volatile struct epoch_tdlist er_tdlist; + volatile uint32_t er_gen; + uint32_t er_cpuid; +} __aligned(EPOCH_ALIGN) *epoch_record_t; + +struct epoch { + struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); + struct epoch_record *e_pcpu_dom[MAXMEMDOM] __aligned(EPOCH_ALIGN); + int e_idx; + int e_flags; + struct epoch_record *e_pcpu[0]; +}; + +#define INIT_CHECK(epoch) \ + do { \ + if (__predict_false((epoch) == NULL)) \ + return; \ + } while (0) + +static __inline void +epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et) +{ + struct epoch_record *er; + struct epoch_thread *etd; + struct thread_lite *td; + MPASS(cold || epoch != NULL); + INIT_CHECK(epoch); + etd = (void *)et; +#ifdef INVARIANTS + MPASS(epoch->e_flags & EPOCH_PREEMPT); + etd->et_magic_pre = EPOCH_MAGIC0; + etd->et_magic_post = EPOCH_MAGIC1; +#endif + td = (struct thread_lite *)curthread; + etd->et_td = (void*)td; + td->td_epochnest++; + critical_enter_sa(td); + sched_pin_lite(td); + + td->td_pre_epoch_prio = td->td_priority; + er = epoch->e_pcpu[curcpu]; + TAILQ_INSERT_TAIL(&er->er_tdlist, etd, et_link); + ck_epoch_begin(&er->er_record, (ck_epoch_section_t *)&etd->et_section); + critical_exit_sa(td); +} + +static __inline void +epoch_enter(epoch_t epoch) +{ + ck_epoch_record_t *record; + struct thread_lite *td; + MPASS(cold || epoch != NULL); + INIT_CHECK(epoch); + td = (struct thread_lite *)curthread; + + td->td_epochnest++; + critical_enter_sa(td); + record = &epoch->e_pcpu[curcpu]->er_record; + ck_epoch_begin(record, NULL); +} + +static __inline void +epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et) +{ + struct epoch_record *er; + struct 
epoch_thread *etd; + struct thread_lite *td; + + INIT_CHECK(epoch); + td = (struct thread_lite *)curthread; + critical_enter_sa(td); + sched_unpin_lite(td); + MPASS(td->td_epochnest); + td->td_epochnest--; + er = epoch->e_pcpu[curcpu]; + MPASS(epoch->e_flags & EPOCH_PREEMPT); + etd = (void *)et; +#ifdef INVARIANTS + MPASS(etd != NULL); + MPASS(etd->et_td == (struct thread *)td); + MPASS(etd->et_magic_pre == EPOCH_MAGIC0); + MPASS(etd->et_magic_post == EPOCH_MAGIC1); + etd->et_magic_pre = 0; + etd->et_magic_post = 0; + etd->et_td = (void*)0xDEADBEEF; +#endif + ck_epoch_end(&er->er_record, + (ck_epoch_section_t *)&etd->et_section); + TAILQ_REMOVE(&er->er_tdlist, etd, et_link); + er->er_gen++; + if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) + epoch_adjust_prio((struct thread *)td, td->td_pre_epoch_prio); + critical_exit_sa(td); +} + +static __inline void +epoch_exit(epoch_t epoch) +{ + ck_epoch_record_t *record; + struct thread_lite *td; + + INIT_CHECK(epoch); + td = (struct thread_lite *)curthread; + MPASS(td->td_epochnest); + td->td_epochnest--; + record = &epoch->e_pcpu[curcpu]->er_record; + ck_epoch_end(record, NULL); + critical_exit_sa(td); +} +#endif /* _KERNEL */ +#endif /* _SYS_EPOCH_PRIVATE_H_ */ Property changes on: head/sys/sys/epoch_private.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/sys/pmckern.h =================================================================== --- head/sys/sys/pmckern.h (revision 335923) +++ head/sys/sys/pmckern.h (revision 335924) @@ -1,269 +1,270 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2007, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. 
* * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * PMC interface used by the base kernel. 
*/ #ifndef _SYS_PMCKERN_H_ #define _SYS_PMCKERN_H_ #include #include #include #include #include #include #include #include #define PMC_FN_PROCESS_EXEC 1 #define PMC_FN_CSW_IN 2 #define PMC_FN_CSW_OUT 3 #define PMC_FN_DO_SAMPLES 4 #define PMC_FN_UNUSED1 5 #define PMC_FN_UNUSED2 6 #define PMC_FN_MMAP 7 #define PMC_FN_MUNMAP 8 #define PMC_FN_USER_CALLCHAIN 9 #define PMC_FN_USER_CALLCHAIN_SOFT 10 #define PMC_FN_SOFT_SAMPLING 11 #define PMC_FN_THR_CREATE 12 #define PMC_FN_THR_EXIT 13 #define PMC_FN_THR_USERRET 14 #define PMC_FN_THR_CREATE_LOG 15 #define PMC_FN_THR_EXIT_LOG 16 #define PMC_FN_PROC_CREATE_LOG 17 #define PMC_HR 0 /* Hardware ring buffer */ #define PMC_SR 1 /* Software ring buffer */ #define PMC_UR 2 /* userret ring buffer */ #define PMC_NUM_SR (PMC_UR+1) struct pmckern_procexec { int pm_credentialschanged; uintfptr_t pm_entryaddr; }; struct pmckern_map_in { void *pm_file; /* filename or vnode pointer */ uintfptr_t pm_address; /* address object is loaded at */ }; struct pmckern_map_out { uintfptr_t pm_address; /* start address of region */ size_t pm_size; /* size of unmapped region */ }; struct pmckern_soft { enum pmc_event pm_ev; int pm_cpu; struct trapframe *pm_tf; }; /* * Soft PMC. */ #define PMC_SOFT_DEFINE_EX(prov, mod, func, name, alloc, release) \ struct pmc_soft pmc_##prov##_##mod##_##func##_##name = \ { 0, alloc, release, { #prov "_" #mod "_" #func "." #name, 0 } }; \ SYSINIT(pmc_##prov##_##mod##_##func##_##name##_init, SI_SUB_KDTRACE, \ SI_ORDER_SECOND + 1, pmc_soft_ev_register, \ &pmc_##prov##_##mod##_##func##_##name ); \ SYSUNINIT(pmc_##prov##_##mod##_##func##_##name##_uninit, \ SI_SUB_KDTRACE, SI_ORDER_SECOND + 1, pmc_soft_ev_deregister, \ &pmc_##prov##_##mod##_##func##_##name ) #define PMC_SOFT_DEFINE(prov, mod, func, name) \ PMC_SOFT_DEFINE_EX(prov, mod, func, name, NULL, NULL) #define PMC_SOFT_DECLARE(prov, mod, func, name) \ extern struct pmc_soft pmc_##prov##_##mod##_##func##_##name /* * PMC_SOFT_CALL can be used anywhere in the kernel. 
* Require md defined PMC_FAKE_TRAPFRAME. */ #ifdef PMC_FAKE_TRAPFRAME #define PMC_SOFT_CALL(pr, mo, fu, na) \ do { \ if (__predict_false(pmc_##pr##_##mo##_##fu##_##na.ps_running)) { \ struct pmckern_soft ks; \ register_t intr; \ intr = intr_disable(); \ PMC_FAKE_TRAPFRAME(&pmc_tf[curcpu]); \ ks.pm_ev = pmc_##pr##_##mo##_##fu##_##na.ps_ev.pm_ev_code; \ ks.pm_cpu = PCPU_GET(cpuid); \ ks.pm_tf = &pmc_tf[curcpu]; \ PMC_CALL_HOOK_UNLOCKED(curthread, \ PMC_FN_SOFT_SAMPLING, (void *) &ks); \ intr_restore(intr); \ } \ } while (0) #else #define PMC_SOFT_CALL(pr, mo, fu, na) \ do { \ } while (0) #endif /* * PMC_SOFT_CALL_TF need to be used carefully. * Userland capture will be done during AST processing. */ #define PMC_SOFT_CALL_TF(pr, mo, fu, na, tf) \ do { \ if (__predict_false(pmc_##pr##_##mo##_##fu##_##na.ps_running)) { \ struct pmckern_soft ks; \ register_t intr; \ intr = intr_disable(); \ ks.pm_ev = pmc_##pr##_##mo##_##fu##_##na.ps_ev.pm_ev_code; \ ks.pm_cpu = PCPU_GET(cpuid); \ ks.pm_tf = tf; \ PMC_CALL_HOOK_UNLOCKED(curthread, \ PMC_FN_SOFT_SAMPLING, (void *) &ks); \ intr_restore(intr); \ } \ } while (0) struct pmc_soft { int ps_running; void (*ps_alloc)(void); void (*ps_release)(void); struct pmc_dyn_event_descr ps_ev; }; struct pmclog_buffer; struct pmc_domain_buffer_header { struct mtx pdbh_mtx; TAILQ_HEAD(, pmclog_buffer) pdbh_head; struct pmclog_buffer *pdbh_plbs; int pdbh_ncpus; } __aligned(CACHE_LINE_SIZE); /* hook */ extern int (*pmc_hook)(struct thread *_td, int _function, void *_arg); extern int (*pmc_intr)(struct trapframe *_frame); /* SX lock protecting the hook */ extern struct sx pmc_sx; /* Per-cpu flags indicating availability of sampling data */ DPCPU_DECLARE(uint8_t, pmc_sampled); /* Count of system-wide sampling PMCs in existence */ extern volatile int pmc_ss_count; /* kernel version number */ extern const int pmc_kernel_version; /* PMC soft per cpu trapframe */ extern struct trapframe pmc_tf[MAXCPU]; /* per domain buffer header list */ extern 
struct pmc_domain_buffer_header *pmc_dom_hdrs[MAXMEMDOM]; /* Quick check if preparatory work is necessary; the cmd argument is not used */ #define PMC_HOOK_INSTALLED(cmd) __predict_false(pmc_hook != NULL) /* Hook invocation; for use within the kernel. Calls pmc_hook, if set, between epoch_enter_preempt and epoch_exit_preempt on global_epoch_preempt. */ #define PMC_CALL_HOOK(t, cmd, arg) \ -do { \ - epoch_enter_preempt(global_epoch_preempt); \ +do { \ + struct epoch_tracker et; \ + epoch_enter_preempt(global_epoch_preempt, &et); \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ - epoch_exit_preempt(global_epoch_preempt); \ + epoch_exit_preempt(global_epoch_preempt, &et); \ } while (0) /* Hook invocation that needs an exclusive lock (pmc_sx is held exclusively around the call) */ #define PMC_CALL_HOOK_X(t, cmd, arg) \ do { \ sx_xlock(&pmc_sx); \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ sx_xunlock(&pmc_sx); \ } while (0) /* * Some hook invocations (e.g., from context switch and clock handling * code) need to be lock-free. */ #define PMC_CALL_HOOK_UNLOCKED(t, cmd, arg) \ do { \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ } while (0) /* Lock-free hook invocation with a NULL argument */ #define PMC_SWITCH_CONTEXT(t,cmd) PMC_CALL_HOOK_UNLOCKED(t,cmd,NULL) /* Check if a process is using HWPMCs (P_HWPMC set in p_flag). NOTE(review): p is not parenthesized in the expansion, so pass a simple pointer expression. */ #define PMC_PROC_IS_USING_PMCS(p) \ (__predict_false(p->p_flag & P_HWPMC)) /* Check if a thread has td_pmcpend set. */ #define PMC_THREAD_HAS_SAMPLES(td) \ (__predict_false((td)->td_pmcpend)) /* Check if a thread has a pending user callchain capture (TDP_CALLCHAIN set in td_pflags). */ #define PMC_IS_PENDING_CALLCHAIN(p) \ (__predict_false((p)->td_pflags & TDP_CALLCHAIN)) #define PMC_SYSTEM_SAMPLING_ACTIVE() (pmc_ss_count > 0) /* Check if a CPU has recorded samples. */ #define PMC_CPU_HAS_SAMPLES(C) (__predict_false(DPCPU_ID_GET((C), pmc_sampled))) /* * Helper functions. */ int pmc_cpu_is_disabled(int _cpu); /* deprecated */ int pmc_cpu_is_active(int _cpu); int pmc_cpu_is_present(int _cpu); int pmc_cpu_is_primary(int _cpu); unsigned int pmc_cpu_max(void); #ifdef INVARIANTS int pmc_cpu_max_active(void); #endif /* * Soft events functions.
*/ void pmc_soft_ev_register(struct pmc_soft *ps); void pmc_soft_ev_deregister(struct pmc_soft *ps); struct pmc_soft *pmc_soft_ev_acquire(enum pmc_event ev); void pmc_soft_ev_release(struct pmc_soft *ps); #endif /* _SYS_PMCKERN_H_ */ Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 335923) +++ head/sys/sys/proc.h (revision 335924) @@ -1,1177 +1,1162 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #include #endif #include #include #include #include #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL #include #endif - /* - * A section object may be passed to every begin-end pair to allow for - * forward progress guarantees with-in prolonged active sections. - * - * We can't include ck_epoch.h so we define our own variant here and - * then CTASSERT that it's the same size in subr_epoch.c - */ -struct epoch_section { - unsigned int bucket; -}; -typedef struct epoch_section epoch_section_t; - -/* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. 
*/ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. 
* * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct socket; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; struct vm_map; struct vm_map_entry; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. 
Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_epochnest u_char td_epochnest; /* (k) Epoch nest counter. */ int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. 
*/ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_sx_slocks; /* (k) Count of sx shared locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. 
*/ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ u_char td_pre_epoch_prio; /* (k) User pri on entry to epoch */ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on fork for child tracing. 
*/ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ /* LP64 hole */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ /* LP64 hole */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ - TAILQ_ENTRY(thread) td_epochq; /* (t) Epoch queue. 
*/ - epoch_section_t td_epoch_section; /* (t) epoch section object */ int td_pmcpend; }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) do { \ KASSERT(SCHEDULER_STOPPED_TD(td) || (td)->td_locks > 0, \ ("thread %p owns no locks", (td))); \ (td)->td_locks--; \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. 
*/ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. 
*/ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ #define TDB_STEP 0x00002000 /* (x86) PSL_T set for PT_STEP */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. 
*/ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. 
*/ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) /* Short string for the first inhibitor bit found set, tested in the order listed; yields the yielding string when none is set. */ #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ?
"iwait" : "yielding") /* Set inhibitor bit(s) and put the thread in TDS_INHIBITED. */ #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) /* Clear inhibitor bit(s); only if one of them was set and no inhibitor remains does the thread become TDS_CAN_RUN. */ #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) /* NOTE(review): TDI_EXITING is not among the TDI_* inhibitors defined in this header (SUSPENDED, SLEEPING, SWAPPED, LOCK, IWAIT) -- confirm it is defined elsewhere, otherwise any use of TD_SET_EXITING will not compile. */ #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* Nonzero if TDF_SEINTR or TDF_SERESTART is set in td_flags. */ #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) /* EINTR if TDF_SEINTR is set in td_flags, otherwise ERESTART. */ #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags.
*/ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. 
*/ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ u_int p_ptevents; /* (c + e) ptrace() event mask. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ int p_pdeathsig; /* (c) Signal from parent on exit. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. 
*/ uint16_t p_elf_machine; /* (x) ELF machine type */ uint64_t p_elf_flags; /* (x) ELF flags */ /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is the child that has been re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. 
*/
#define	NOCPU_OLD	(255)
#define	MAXCPU_OLD	(254)

/*
 * Wrappers around four spin mutexes reached through p.  Each trio locks,
 * unlocks, or asserts the state of one mutex: p_slock, p_statmtx,
 * p_itimmtx and p_profmtx respectively.
 */
#define	PROC_SLOCK(p)	mtx_lock_spin(&(p)->p_slock)
#define	PROC_SUNLOCK(p)	mtx_unlock_spin(&(p)->p_slock)
#define	PROC_SLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_slock, (type))

#define	PROC_STATLOCK(p)	mtx_lock_spin(&(p)->p_statmtx)
#define	PROC_STATUNLOCK(p)	mtx_unlock_spin(&(p)->p_statmtx)
#define	PROC_STATLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_statmtx, (type))

#define	PROC_ITIMLOCK(p)	mtx_lock_spin(&(p)->p_itimmtx)
#define	PROC_ITIMUNLOCK(p)	mtx_unlock_spin(&(p)->p_itimmtx)
#define	PROC_ITIMLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_itimmtx, (type))

#define	PROC_PROFLOCK(p)	mtx_lock_spin(&(p)->p_profmtx)
#define	PROC_PROFUNLOCK(p)	mtx_unlock_spin(&(p)->p_profmtx)
#define	PROC_PROFLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_profmtx, (type))

/* These flags are kept in p_flag. */
#define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
#define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
#define	P_KPROC		0x00004	/* Kernel process. */
#define	P_UNUSED3	0x00008	/* --available-- */
#define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
#define	P_PROFIL	0x00020	/* Has started profiling. */
#define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
#define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
#define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
#define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
#define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
#define	P_TRACED	0x00800	/* Debugged process being traced. */
#define	P_WAITED	0x01000	/* Someone is waiting for us. */
#define	P_WEXIT		0x02000	/* Working on exiting. */
#define	P_EXEC		0x04000	/* Process called exec. */
#define	P_WKILLED	0x08000	/* Killed, go to kernel/user boundary ASAP. */
#define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
#define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP.
*/
#define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
#define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
#define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
#define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
#define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
#define	P_HWPMC		0x800000 /* Process is using HWPMCs */
#define	P_JAILED	0x1000000 /* Process is in jail. */
#define	P_TOTAL_STOP	0x2000000 /* Stopped in stop_all_proc. */
#define	P_INEXEC	0x4000000 /* Process is in execve(). */
#define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
#define	P_INMEM		0x10000000 /* Loaded into memory. */
#define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
#define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
#define	P_PPTRACE	0x80000000 /* PT_TRACEME by vforked child. */

/* Mask of every "stopped" reason bit in p_flag. */
#define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
/* Non-zero when any of the P_STOPPED bits is set in p->p_flag. */
#define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
/* Non-zero when P_WKILLED is set in p->p_flag. */
#define	P_KILLED(p)	((p)->p_flag & P_WKILLED)

/* These flags are kept in p_flag2. */
#define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
#define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
#define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOTRACE on exec(2). */
#define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
#define	P2_PTRACE_FSTP	0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */
#define	P2_TRAPCAP	0x00000020	/* SIGTRAP on ENOTCAPABLE */

/* Flags protected by proctree_lock, kept in p_treeflag. */
#define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */
#define	P_TREE_FIRST_ORPHAN	0x00000002	/* First element of orphan list */
#define	P_TREE_REAPER		0x00000004	/* Reaper of subtree */

/*
 * These were process status values (p_stat), now they are only used in
 * legacy conversion code.
 */
#define	SIDL	1		/* Process being created by fork. */
#define	SRUN	2		/* Currently runnable.
*/
#define	SSLEEP	3		/* Sleeping on an address. */
#define	SSTOP	4		/* Process debugging or suspension. */
#define	SZOMB	5		/* Awaiting collection by parent. */
#define	SWAIT	6		/* Waiting for interrupt. */
#define	SLOCK	7		/* Blocked on a lock. */

/* NOTE(review): presumably the value stored in p_magic -- confirm. */
#define	P_MAGIC		0xbeefface

#ifdef _KERNEL

/* Types and flags for mi_switch(). */
#define	SW_TYPE_MASK		0xff	/* First 8 bits are switch type */
#define	SWT_NONE		0	/* Unspecified switch. */
#define	SWT_PREEMPT		1	/* Switching due to preemption. */
#define	SWT_OWEPREEMPT		2	/* Switching due to owepreempt. */
#define	SWT_TURNSTILE		3	/* Turnstile contention. */
#define	SWT_SLEEPQ		4	/* Sleepq wait. */
#define	SWT_SLEEPQTIMO		5	/* Sleepq timeout wait. */
#define	SWT_RELINQUISH		6	/* yield call. */
#define	SWT_NEEDRESCHED		7	/* NEEDRESCHED was set. */
#define	SWT_IDLE		8	/* Switching from the idle thread. */
#define	SWT_IWAIT		9	/* Waiting for interrupts. */
#define	SWT_SUSPEND		10	/* Thread suspended. */
#define	SWT_REMOTEPREEMPT	11	/* Remote processor preempted. */
#define	SWT_REMOTEWAKEIDLE	12	/* Remote processor preempted idle. */
#define	SWT_COUNT		13	/* Number of switch types. */
/* Flags (these sit above the SW_TYPE_MASK bits). */
#define	SW_VOL		0x0100		/* Voluntary switch. */
#define	SW_INVOL	0x0200		/* Involuntary switch. */
#define	SW_PREEMPT	0x0400		/* The invol switch is a preemption */

/* Values for the "how" argument of thread_single(). */
#define	SINGLE_NO_EXIT	0
#define	SINGLE_EXIT	1
#define	SINGLE_BOUNDARY	2
#define	SINGLE_ALLPROC	3

/* Malloc types are only declared when the includer has MALLOC_DECLARE. */
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_PARGS);
MALLOC_DECLARE(M_PGRP);
MALLOC_DECLARE(M_SESSION);
MALLOC_DECLARE(M_SUBPROC);
#endif

/* Walk every entry on the allproc list through its p_list linkage. */
#define	FOREACH_PROC_IN_SYSTEM(p)					\
	LIST_FOREACH((p), &allproc, p_list)
/* Walk every thread on p->p_threads through its td_plist linkage. */
#define	FOREACH_THREAD_IN_PROC(p, td)					\
	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)

/* First entry of p->p_threads. */
#define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)

/*
 * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit
 * in a pid_t, as it is used to represent "no process group".
*/ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. 
* _PHOLD_LITE() also takes the process locked, but comparing with
 * _PHOLD(), it only guarantees that exit1() is not executed,
 * faultin() is not called.
 */
/* Take the process lock, apply _PHOLD(), then drop the lock. */
#define	PHOLD(p) do {							\
	PROC_LOCK(p);							\
	_PHOLD(p);							\
	PROC_UNLOCK(p);							\
} while (0)
/*
 * With the process lock held: assert that p is not exiting (unless it is
 * curproc), bump p_lock, and call faultin() when P_INMEM is clear.
 */
#define	_PHOLD(p) do {							\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
	    ("PHOLD of exiting process %p", p));			\
	(p)->p_lock++;							\
	if (((p)->p_flag & P_INMEM) == 0)				\
		faultin((p));						\
} while (0)
/* Same checks and p_lock increment as _PHOLD(), minus the faultin() step. */
#define	_PHOLD_LITE(p) do {						\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
	    ("PHOLD of exiting process %p", p));			\
	(p)->p_lock++;							\
} while (0)
/* Assert that at least one hold is outstanding (p_lock > 0). */
#define	PROC_ASSERT_HELD(p) do {					\
	KASSERT((p)->p_lock > 0, ("process %p not held", p));		\
} while (0)

/* Take the process lock, apply _PRELE(), then drop the lock. */
#define	PRELE(p) do {							\
	PROC_LOCK((p));							\
	_PRELE((p));							\
	PROC_UNLOCK((p));						\
} while (0)
/*
 * With the process lock held: drop one hold.  If that was the last hold
 * and P_WEXIT is set, wake up sleepers on &p->p_lock.
 */
#define	_PRELE(p) do {							\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	PROC_ASSERT_HELD(p);						\
	(--(p)->p_lock);						\
	if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0)		\
		wakeup(&(p)->p_lock);					\
} while (0)
/* Assert that no hold is outstanding (p_lock == 0). */
#define	PROC_ASSERT_NOT_HELD(p) do {					\
	KASSERT((p)->p_lock == 0, ("process %p held", p));		\
} while (0)

/* Increment p_cowgen; the process lock must be held. */
#define	PROC_UPDATE_COW(p) do {						\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	(p)->p_cowgen++;						\
} while (0)

/* Check whether a thread is safe to be swapped out. */
#define	thread_safetoswapout(td)	((td)->td_flags & TDF_CANSWAP)

/* Control whether or not it is safe for curthread to sleep.
*/
/*
 * td_no_sleeping is used as a counter: THREAD_NO_SLEEPING() increments it,
 * THREAD_SLEEPING_OK() decrements it, and THREAD_CAN_SLEEP() is true only
 * when it is zero.
 */
#define	THREAD_NO_SLEEPING()		((curthread)->td_no_sleeping++)
#define	THREAD_SLEEPING_OK()		((curthread)->td_no_sleeping--)
#define	THREAD_CAN_SLEEP()		((curthread)->td_no_sleeping == 0)

/*
 * Each *HASH(key) macro below yields the address of the list head selected
 * by AND-ing the key with the matching mask variable.
 */
#define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
extern u_long pidhash;
#define	TIDHASH(tid)	(&tidhashtbl[(tid) & tidhash])
extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
extern u_long tidhash;
extern struct rwlock tidhash_lock;

#define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
extern u_long pgrphash;

extern struct sx allproc_lock;
extern int allproc_gen;
extern struct sx proctree_lock;
extern struct mtx ppeers_lock;
extern struct proc proc0;		/* Process slot for swapper. */
extern struct thread0_storage thread0_st;	/* Primary thread in proc0. */
#define	thread0 (thread0_st.t0st_thread)
extern struct vmspace vmspace0;		/* VM space for proc0. */
extern int hogticks;			/* Limit on kernel cpu hogs. */
extern int lastpid;
extern int nprocs, maxproc;		/* Current and max number of procs. */
extern int maxprocperuid;		/* Max procs per uid. */
extern u_long ps_arg_cache_limit;

LIST_HEAD(proclist, proc);
TAILQ_HEAD(procqueue, proc);
TAILQ_HEAD(threadqueue, thread);
extern struct proclist allproc;		/* List of all processes. */
extern struct proclist zombproc;	/* List of zombie processes. */
extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */

extern struct uma_zone *proc_zone;

struct	proc *pfind(pid_t);		/* Find process by id. */
struct	proc *pfind_any(pid_t);		/* Find (zombie) process by id. */
struct	proc *pfind_locked(pid_t pid);
struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
struct	proc *zpfind(pid_t);		/* Find zombie process by id. */

/*
 * Argument bundle handed to fork1() (see the fork1() prototype, which takes
 * a struct fork_req *).  NOTE(review): the per-field meanings are not
 * visible in this header chunk -- confirm against the fork1()
 * implementation before documenting them.
 */
struct fork_req {
	int		fr_flags;
	int		fr_pages;
	int 		*fr_pidp;
	struct proc 	**fr_procp;
	int 		*fr_pd_fd;
	int 		fr_pd_flags;
	struct filecaps	*fr_pd_fcaps;
};

/*
 * pget() flags.
*/
#define	PGET_HOLD	0x00001	/* Hold the process. */
#define	PGET_CANSEE	0x00002	/* Check against p_cansee(). */
#define	PGET_CANDEBUG	0x00004	/* Check against p_candebug(). */
#define	PGET_ISCURRENT	0x00008	/* Check that the found process is current. */
#define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
#define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
#define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */

/* Combination: hold the process, check p_candebug(), reject P_WEXIT. */
#define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)

int	pget(pid_t pid, int flags, struct proc **pp);

/* Process, credential and scheduling entry points. */
void	ast(struct trapframe *framep);
struct	thread *choosethread(void);
int	cr_cansee(struct ucred *u1, struct ucred *u2);
int	cr_canseesocket(struct ucred *cred, struct socket *so);
int	cr_canseeothergids(struct ucred *u1, struct ucred *u2);
int	cr_canseeotheruids(struct ucred *u1, struct ucred *u2);
int	cr_canseejailproc(struct ucred *u1, struct ucred *u2);
int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
	    struct session *sess);
int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
void	faultin(struct proc *p);
void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
int	fork1(struct thread *, struct fork_req *);
void	fork_exit(void (*)(void *, struct trapframe *), void *,
	    struct trapframe *);
void	fork_return(struct thread *, struct trapframe *);
int	inferior(struct proc *p);
void	kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry,
	    int *resident_count, bool *super);
void	kern_yield(int);
void 	kick_proc0(void);
void	killjobc(void);
int	leavepgrp(struct proc *p);
int	maybe_preempt(struct thread *td);
void	maybe_yield(void);
void	mi_switch(int flags, struct thread *newtd);
int	p_candebug(struct thread *td, struct proc *p);
int	p_cansee(struct thread *td, struct proc *p);
int	p_cansched(struct thread *td, struct proc *p);
int	p_cansignal(struct thread *td, struct proc *p, int signum);
int	p_canwait(struct thread *td, struct proc *p);
struct	pargs *pargs_alloc(int len);
void	pargs_drop(struct pargs *pa);
void	pargs_hold(struct pargs *pa);
int	proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
int	proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
int	proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
void	procinit(void);
void	proc_linkup0(struct proc *p, struct thread *td);
void	proc_linkup(struct proc *p, struct thread *td);
struct proc *proc_realparent(struct proc *child);
void	proc_reap(struct thread *td, struct proc *p, int *status, int options);
void	proc_reparent(struct proc *child, struct proc *newparent);
void	proc_set_traced(struct proc *p, bool stop);
struct	pstats *pstats_alloc(void);
void	pstats_fork(struct pstats *src, struct pstats *dst);
void	pstats_free(struct pstats *ps);
void	reaper_abandon_children(struct proc *p, bool exiting);
int	securelevel_ge(struct ucred *cr, int level);
int	securelevel_gt(struct ucred *cr, int level);
void	sess_hold(struct session *);
void	sess_release(struct session *);
int	setrunnable(struct thread *);
void	setsugid(struct proc *p);
int	should_yield(void);
int	sigonstack(size_t sp);
void	stopevent(struct proc *, u_int, u_int);
struct	thread *tdfind(lwpid_t, pid_t);
void	threadinit(void);
void	tidhash_add(struct thread *);
void	tidhash_remove(struct thread *);
void	cpu_idle(int);
int	cpu_idle_wakeup(int);
extern	void (*cpu_idle_hook)(sbintime_t);	/* Hook to machdep CPU idler.
*/ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void 
thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } extern void (*softdep_ast_cleanup)(struct thread *); static __inline void td_softdep_cleanup(struct thread *td) { if (td->td_su != NULL && softdep_ast_cleanup != NULL) softdep_ast_cleanup(td); } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */