Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c =================================================================== --- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c (revision 313820) +++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c (revision 313821) @@ -1,1686 +1,1692 @@ /************************************************************************** Copyright (c) 2007, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTR static char *states[] = { "idle", "listen", "connecting", "mpa_wait_req", "mpa_req_sent", "mpa_req_rcvd", "mpa_rep_sent", "fpdu_mode", "aborting", "closing", "moribund", "dead", NULL, }; #endif SYSCTL_NODE(_hw, OID_AUTO, iw_cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); static int ep_timeout_secs = 60; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RWTUN, &ep_timeout_secs, 0, "CM Endpoint operation timeout in seconds (default=60)"); static int mpa_rev = 1; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RWTUN, &mpa_rev, 0, "MPA Revision, 0 supports amso1100, 1 is spec compliant. 
(default=1)"); static int markers_enabled = 0; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RWTUN, &markers_enabled, 0, "Enable MPA MARKERS (default(0)=disabled)"); static int crc_enabled = 1; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RWTUN, &crc_enabled, 0, "Enable MPA CRC (default(1)=enabled)"); static int rcv_win = 256 * 1024; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RWTUN, &rcv_win, 0, "TCP receive window in bytes (default=256KB)"); static int snd_win = 32 * 1024; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, snd_win, CTLFLAG_RWTUN, &snd_win, 0, "TCP send window in bytes (default=32KB)"); static unsigned int nocong = 0; SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, nocong, CTLFLAG_RWTUN, &nocong, 0, "Turn off congestion control (default=0)"); static unsigned int cong_flavor = 1; SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RWTUN, &cong_flavor, 0, "TCP Congestion control flavor (default=1)"); static void ep_timeout(void *arg); static void connect_reply_upcall(struct iwch_ep *ep, int status); static int iwch_so_upcall(struct socket *so, void *arg, int waitflag); /* * Cruft to offload socket upcalls onto thread. */ static struct mtx req_lock; static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list; static struct task iw_cxgb_task; static struct taskqueue *iw_cxgb_taskq; static void process_req(void *ctx, int pending); static void start_ep_timer(struct iwch_ep *ep) { CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); if (callout_pending(&ep->timer)) { CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep); callout_deactivate(&ep->timer); callout_drain(&ep->timer); } else { /* * XXX this looks racy */ get_ep(&ep->com); callout_init(&ep->timer, 1); } callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep); } static void stop_ep_timer(struct iwch_ep *ep) { CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); if (!callout_pending(&ep->timer)) { CTR3(KTR_IW_CXGB, "%s timer stopped when its not running! 
ep %p state %u\n", __func__, ep, ep->com.state); return; } callout_drain(&ep->timer); put_ep(&ep->com); } static int set_tcpinfo(struct iwch_ep *ep) { struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; struct toepcb *toep; int rc = 0; INP_WLOCK(inp); tp = intotcpcb(inp); if ((tp->t_flags & TF_TOE) == 0) { rc = EINVAL; printf("%s: connection NOT OFFLOADED!\n", __func__); goto done; } toep = tp->t_toe; ep->hwtid = toep->tp_tid; ep->snd_seq = tp->snd_nxt; ep->rcv_seq = tp->rcv_nxt; ep->emss = tp->t_maxseg; if (ep->emss < 128) ep->emss = 128; done: INP_WUNLOCK(inp); return (rc); } static enum iwch_ep_state state_read(struct iwch_ep_common *epc) { enum iwch_ep_state state; mtx_lock(&epc->lock); state = epc->state; mtx_unlock(&epc->lock); return state; } static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) { epc->state = new; } static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) { mtx_lock(&epc->lock); CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]); __state_set(epc, new); mtx_unlock(&epc->lock); return; } static void * alloc_ep(int size, int flags) { struct iwch_ep_common *epc; epc = malloc(size, M_DEVBUF, flags); if (epc) { memset(epc, 0, size); refcount_init(&epc->refcount, 1); mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK); cv_init(&epc->waitq, "iwch_epc cv"); } CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc); return epc; } void __free_ep(struct iwch_ep_common *epc) { CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]); KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc)); free(epc, M_DEVBUF); } static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos, struct nhop4_extended *pnh4) { struct in_addr addr; addr.s_addr = peer_ip; return (fib4_lookup_nh_ext(RT_DEFAULT_FIB, addr, NHR_REF, 0, pnh4)); } static void close_socket(struct iwch_ep_common *epc, int close) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); SOCK_LOCK(epc->so); soupcall_clear(epc->so, SO_RCV); SOCK_UNLOCK(epc->so); if (close) soclose(epc->so); else soshutdown(epc->so, SHUT_WR|SHUT_RD); epc->so = NULL; } static void shutdown_socket(struct iwch_ep_common *epc) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); soshutdown(epc->so, SHUT_WR); } static void abort_socket(struct iwch_ep *ep) { struct sockopt sopt; int err; struct linger l; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); l.l_onoff = 1; l.l_linger = 0; /* linger_time of 0 forces RST to be sent */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_LINGER; sopt.sopt_val = (caddr_t)&l; sopt.sopt_valsize = sizeof l; sopt.sopt_td = NULL; err = sosetopt(ep->com.so, &sopt); if (err) printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err); } static void send_mpa_req(struct iwch_ep *ep) { int mpalen; struct mpa_message *mpa; struct mbuf *m; int err; CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen); mpalen = sizeof(*mpa) + ep->plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { connect_reply_upcall(ep, -ENOMEM); return; } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); mpa->flags = (crc_enabled ? MPA_CRC : 0) | (markers_enabled ? 
MPA_MARKERS : 0); mpa->private_data_size = htons(ep->plen); mpa->revision = mpa_rev; if (ep->plen) memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (err) { m_freem(m); connect_reply_upcall(ep, -ENOMEM); return; } start_ep_timer(ep); state_set(&ep->com, MPA_REQ_SENT); return; } static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; int err; CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen); mpalen = sizeof(*mpa) + plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { printf("%s - cannot alloc mbuf!\n", __FUNCTION__); return (-ENOMEM); } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = MPA_REJECT; mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (plen) memcpy(mpa->private_data, pdata, plen); err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); PANIC_IF(err); return 0; } static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen); mpalen = sizeof(*mpa) + plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { printf("%s - cannot alloc mbuf!\n", __FUNCTION__); return (-ENOMEM); } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (plen) memcpy(mpa->private_data, pdata, plen); state_set(&ep->com, MPA_REP_SENT); return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); } static void close_complete_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void abort_connection(struct iwch_ep *ep) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); state_set(&ep->com, ABORTING); abort_socket(ep); close_socket(&ep->com, 0); close_complete_upcall(ep); state_set(&ep->com, DEAD); put_ep(&ep->com); } static void peer_close_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_DISCONNECT; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } } static void peer_abort_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = ECONNRESET; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep, 
ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void connect_reply_upcall(struct iwch_ep *ep, int status) { struct iw_cm_event event; CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = status; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; if ((status == 0) || (status == ECONNREFUSED)) { event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } if (ep->com.cm_id) { CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep, ep->hwtid, status); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } if (status < 0) { ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void connect_request_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; event.so = ep->com.so; if (state_read(&ep->parent_ep->com) != DEAD) { get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( ep->parent_ep->com.cm_id, &event); } put_ep(&ep->parent_ep->com); } static void established_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } } static void process_mpa_reply(struct iwch_ep *ep) { struct mpa_message *mpa; u16 plen; struct iwch_qp_attributes attrs; enum iwch_qp_attr_mask mask; int err; struct mbuf *top, *m; int flags = MSG_DONTWAIT; struct uio uio; int len; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); /* * Stop mpa timer. If it expired, then the state has * changed and we bail since ep_timeout already aborted * the connection. */ stop_ep_timer(ep); if (state_read(&ep->com) != MPA_REQ_SENT) return; uio.uio_resid = len = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { start_ep_timer(ep); return; } err = -err; goto err; } if (ep->com.so->so_rcv.sb_mb) { printf("%s data after soreceive called! so %p sb_mb %p top %p\n", __FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); } m = top; do { /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { err = (-EINVAL); goto err; } /* * copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) return; mpa = (struct mpa_message *)ep->mpa_pkt; /* Validate MPA header. 
*/ if (mpa->revision != mpa_rev) { CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); err = EPROTO; goto err; } if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); err = EPROTO; goto err; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); err = EPROTO; goto err; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len); err = EPROTO; goto err; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) return; if (mpa->flags & MPA_REJECT) { err = ECONNREFUSED; goto err; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. And * the MPA header is valid. */ CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__); state_set(&ep->com, FPDU_MODE); ep->mpa_attr.initiator = 1; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa_rev; if (set_tcpinfo(ep)) { printf("%s set_tcpinfo error\n", __FUNCTION__); goto err; } CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " "xmit_marker_enabled=%d, version=%d", __FUNCTION__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = IWCH_QP_STATE_RTS; mask = IWCH_QP_ATTR_NEXT_STATE | IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; /* bind QP and TID with INIT_WR */ err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (!err) goto out; err: abort_connection(ep); out: connect_reply_upcall(ep, err); return; } static void process_mpa_request(struct iwch_ep *ep) { struct mpa_message *mpa; u16 plen; int flags = MSG_DONTWAIT; struct mbuf *top, *m; int err; struct uio uio; int len; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); /* * Stop mpa timer. If it expired, then the state has * changed and we bail since ep_timeout already aborted * the connection. */ stop_ep_timer(ep); if (state_read(&ep->com) != MPA_REQ_WAIT) return; uio.uio_resid = len = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { start_ep_timer(ep); return; } err = -err; goto err; } m = top; do { /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__, ep->mpa_pkt_len + m->m_len); goto err; } /* * Copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * If we don't even have the mpa message, then bail. * We'll continue process when more data arrives. 
*/ if (ep->mpa_pkt_len < sizeof(*mpa)) { start_ep_timer(ep); CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__, ep->mpa_pkt_len); return; } mpa = (struct mpa_message *) ep->mpa_pkt; /* * Validate MPA Header. */ if (mpa->revision != mpa_rev) { CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); goto err; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); goto err; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); goto err; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__, ep->mpa_pkt_len); goto err; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { start_ep_timer(ep); CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__, ep->mpa_pkt_len); return; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. */ ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa_rev; if (set_tcpinfo(ep)) { printf("%s set_tcpinfo error\n", __FUNCTION__); goto err; } CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " "xmit_marker_enabled=%d, version=%d", __FUNCTION__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); state_set(&ep->com, MPA_REQ_RCVD); /* drive upcall */ connect_request_upcall(ep); return; err: abort_connection(ep); return; } static void process_peer_close(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int disconnect = 1; int release = 0; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); mtx_lock(&ep->com.lock); switch (ep->com.state) { case MPA_REQ_WAIT: __state_set(&ep->com, CLOSING); break; case MPA_REQ_SENT: __state_set(&ep->com, CLOSING); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. 
*/ __state_set(&ep->com, CLOSING); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); break; case FPDU_MODE: start_ep_timer(ep); __state_set(&ep->com, CLOSING); attrs.next_state = IWCH_QP_STATE_CLOSING; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); peer_close_upcall(ep); break; case ABORTING: disconnect = 0; break; case CLOSING: __state_set(&ep->com, MORIBUND); disconnect = 0; break; case MORIBUND: stop_ep_timer(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = IWCH_QP_STATE_IDLE; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; disconnect = 0; break; case DEAD: disconnect = 0; break; default: PANIC_IF(1); } mtx_unlock(&ep->com.lock); if (disconnect) iwch_ep_disconnect(ep, 0, M_NOWAIT); if (release) put_ep(&ep->com); return; } static void process_conn_error(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int ret; mtx_lock(&ep->com.lock); CTR3(KTR_IW_CXGB, "%s ep %p state %u", __func__, ep, ep->com.state); switch (ep->com.state) { case MPA_REQ_WAIT: stop_ep_timer(ep); break; case MPA_REQ_SENT: stop_ep_timer(ep); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: ep->com.rpl_err = ECONNRESET; CTR1(KTR_IW_CXGB, "waking up ep %p", ep); break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. */ break; case MORIBUND: case CLOSING: stop_ep_timer(ep); /*FALLTHROUGH*/ case FPDU_MODE: if (ep->com.cm_id && ep->com.qp) { attrs.next_state = IWCH_QP_STATE_ERROR; ret = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); if (ret) log(LOG_ERR, "%s - qp <- error failed!\n", __FUNCTION__); } peer_abort_upcall(ep); break; case ABORTING: break; case DEAD: mtx_unlock(&ep->com.lock); CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, ep->com.so->so_error); return; default: PANIC_IF(1); break; } if (ep->com.state != ABORTING) { close_socket(&ep->com, 0); __state_set(&ep->com, DEAD); put_ep(&ep->com); } mtx_unlock(&ep->com.lock); return; } static void process_close_complete(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int release = 0; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); PANIC_IF(!ep); /* The cm_id may be null if we failed to connect */ mtx_lock(&ep->com.lock); switch (ep->com.state) { case CLOSING: __state_set(&ep->com, MORIBUND); break; case MORIBUND: stop_ep_timer(ep); if ((ep->com.cm_id) && (ep->com.qp)) { attrs.next_state = IWCH_QP_STATE_IDLE; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } if (ep->parent_ep) close_socket(&ep->com, 1); else close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; break; case ABORTING: break; case DEAD: default: PANIC_IF(1); break; } mtx_unlock(&ep->com.lock); if (release) put_ep(&ep->com); return; } /* * T3A does 3 things when a TERM is received: * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet * 2) generate an async event on the QP with the TERMINATE opcode * 3) post a TERMINATE opcde cqe into the associated CQ. * * For (1), we save the message in the qp for later consumer consumption. * For (2), we move the QP into TERMINATE, post a QP event and disconnect. * For (3), we toss the CQE in cxio_poll_cq(). * * terminate() handles case (1)... 
*/ static int terminate(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; uint32_t hash = *((uint32_t *)r + 1); unsigned int tid = ntohl(hash) >> 8 & 0xfffff; struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct socket *so = toep->tp_inp->inp_socket; struct iwch_ep *ep = so->so_rcv.sb_upcallarg; if (state_read(&ep->com) != FPDU_MODE) goto done; m_adj(m, sizeof(struct cpl_rdma_terminate)); CTR4(KTR_IW_CXGB, "%s: tid %u, ep %p, saved %d bytes", __func__, tid, ep, m->m_len); m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer); ep->com.qp->attr.terminate_msg_len = m->m_len; ep->com.qp->attr.is_terminate_local = 0; done: m_freem(m); return (0); } static int ec_status(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rdma_ec_status *rep = mtod(m, void *); unsigned int tid = GET_TID(rep); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct socket *so = toep->tp_inp->inp_socket; struct iwch_ep *ep = so->so_rcv.sb_upcallarg; if (rep->status) { struct iwch_qp_attributes attrs; CTR1(KTR_IW_CXGB, "%s BAD CLOSE - Aborting", __FUNCTION__); stop_ep_timer(ep); attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); abort_connection(ep); } m_freem(m); return (0); } static void ep_timeout(void *arg) { struct iwch_ep *ep = (struct iwch_ep *)arg; struct iwch_qp_attributes attrs; int err = 0; int abort = 1; mtx_lock(&ep->com.lock); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); switch (ep->com.state) { case MPA_REQ_SENT: __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: __state_set(&ep->com, ABORTING); break; case CLOSING: case MORIBUND: if (ep->com.cm_id && ep->com.qp) err = 1; __state_set(&ep->com, ABORTING); break; default: CTR3(KTR_IW_CXGB, "%s unexpected state ep %p state %u\n", __func__, ep, ep->com.state); abort = 0; } mtx_unlock(&ep->com.lock); if (err){ attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } if (abort) abort_connection(ep); put_ep(&ep->com); } int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { int err; struct iwch_ep *ep = to_ep(cm_id); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); if (state_read(&ep->com) == DEAD) { put_ep(&ep->com); return (-ECONNRESET); } PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); if (mpa_rev == 0) { abort_connection(ep); } else { err = send_mpa_reject(ep, pdata, pdata_len); err = soshutdown(ep->com.so, 3); } put_ep(&ep->com); return 0; } int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err; struct iwch_qp_attributes attrs; enum iwch_qp_attr_mask mask; struct iwch_ep *ep = to_ep(cm_id); struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_qp *qp = get_qhp(h, conn_param->qpn); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); if (state_read(&ep->com) == DEAD) { err = -ECONNRESET; goto err; } PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); PANIC_IF(!qp); if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { abort_connection(ep); err = -EINVAL; goto err; } cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; 
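	/*
	 * The endpoint now holds a reference on the cm_id; it is
	 * released on the err1 path below if the QP transition or the
	 * MPA reply fails.
	 */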
ep->com.qp = qp; ep->com.rpl_err = 0; ep->com.rpl_done = 0; ep->ird = conn_param->ird; ep->ord = conn_param->ord; CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord); /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = IWCH_QP_STATE_RTS; /* bind QP and TID with INIT_WR */ mask = IWCH_QP_ATTR_NEXT_STATE | IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) goto err1; err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) goto err1; state_set(&ep->com, FPDU_MODE); established_upcall(ep); put_ep(&ep->com); return 0; err1: ep->com.cm_id = NULL; ep->com.qp = NULL; cm_id->rem_ref(cm_id); err: put_ep(&ep->com); return err; } static int init_sock(struct iwch_ep_common *epc) { int err; struct sockopt sopt; int on=1; SOCK_LOCK(epc->so); soupcall_set(epc->so, SO_RCV, iwch_so_upcall, epc); epc->so->so_state |= SS_NBIO; SOCK_UNLOCK(epc->so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; sopt.sopt_valsize = sizeof on; sopt.sopt_td = NULL; err = sosetopt(epc->so, &sopt); if (err) printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err); return 0; } static int is_loopback_dst(struct iw_cm_id *cm_id) { uint16_t port = cm_id->remote_addr.sin_port; int ifa_present; cm_id->remote_addr.sin_port = 0; ifa_present = ifa_ifwithaddr_check( (struct sockaddr *)&cm_id->remote_addr); cm_id->remote_addr.sin_port = port; return (ifa_present); } int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err = 0; struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_ep *ep; struct nhop4_extended nh4; struct toedev *tdev; if (is_loopback_dst(cm_id)) { err = -ENOSYS; goto out; } ep = alloc_ep(sizeof(*ep), M_NOWAIT); if (!ep) { printf("%s - cannot alloc ep.\n", __FUNCTION__); err = (-ENOMEM); goto out; } callout_init(&ep->timer, 1); ep->plen = conn_param->private_data_len; if (ep->plen) memcpy(ep->mpa_pkt + sizeof(struct mpa_message), conn_param->private_data, ep->plen); ep->ird = conn_param->ird; ep->ord = conn_param->ord; cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = get_qhp(h, conn_param->qpn); ep->com.thread = curthread; PANIC_IF(!ep->com.qp); CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn, ep->com.qp, cm_id); ep->com.so = cm_id->so; err = init_sock(&ep->com); if (err) goto fail2; /* find a route */ err = find_route(cm_id->local_addr.sin_addr.s_addr, cm_id->remote_addr.sin_addr.s_addr, cm_id->local_addr.sin_port, cm_id->remote_addr.sin_port, IPTOS_LOWDELAY, &nh4); if (err) { printf("%s - cannot find route.\n", __FUNCTION__); err = EHOSTUNREACH; goto fail2; } if (!(nh4.nh_ifp->if_flags & IFCAP_TOE)) { printf("%s - interface not TOE capable.\n", __FUNCTION__); fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } tdev = TOEDEV(nh4.nh_ifp); if (tdev == NULL) { printf("%s - No toedev for interface.\n", __FUNCTION__); fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); state_set(&ep->com, CONNECTING); ep->com.local_addr = cm_id->local_addr; ep->com.remote_addr = cm_id->remote_addr; err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, ep->com.thread); if (!err) goto out; fail2: put_ep(&ep->com); out: return err; } int 
iwch_create_listen_ep(struct iw_cm_id *cm_id, int backlog)
{
	int err = 0;
	struct iwch_listen_ep *ep;

	ep = alloc_ep(sizeof(*ep), M_NOWAIT);
	if (!ep) {
		printf("%s - cannot alloc ep.\n", __FUNCTION__);
		err = ENOMEM;
		goto out;
	}
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->backlog = backlog;
	ep->com.local_addr = cm_id->local_addr;
	ep->com.thread = curthread;
	state_set(&ep->com, LISTEN);
	ep->com.so = cm_id->so;
	cm_id->provider_data = ep;
out:
	return err;
}

void
iwch_destroy_listen_ep(struct iw_cm_id *cm_id)
{
	struct iwch_listen_ep *ep = to_listen_ep(cm_id);

	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	state_set(&ep->com, DEAD);
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
	return;
}

int
iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags)
{
	int close = 0;

	mtx_lock(&ep->com.lock);
	PANIC_IF(!ep);
	PANIC_IF(!ep->com.so);
	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__,
	    ep, ep->com.so, states[ep->com.state], abrupt);
	switch (ep->com.state) {
	case MPA_REQ_WAIT:
	case MPA_REQ_SENT:
	case MPA_REQ_RCVD:
	case MPA_REP_SENT:
	case FPDU_MODE:
		close = 1;
		if (abrupt)
			ep->com.state = ABORTING;
		else {
			ep->com.state = CLOSING;
			start_ep_timer(ep);
		}
		break;
	case CLOSING:
		close = 1;
		if (abrupt) {
			stop_ep_timer(ep);
			ep->com.state = ABORTING;
		} else
			ep->com.state = MORIBUND;
		break;
	case MORIBUND:
	case ABORTING:
	case DEAD:
		CTR3(KTR_IW_CXGB, "%s ignoring disconnect ep %p state %u\n",
		    __func__, ep, ep->com.state);
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
		break;
	}
	mtx_unlock(&ep->com.lock);
	if (close) {
		if (abrupt)
			abort_connection(ep);
		else {
			if (!ep->parent_ep)
				__state_set(&ep->com, MORIBUND);
			shutdown_socket(&ep->com);
		}
	}
	return 0;
}

static void
process_data(struct iwch_ep *ep)
{
	struct sockaddr_in *local, *remote;
+#ifdef KTR
+	char local_str[INET_ADDRSTRLEN], remote_str[INET_ADDRSTRLEN];
+#endif

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep,
	    ep->com.so, states[ep->com.state]);

	switch (state_read(&ep->com)) {
	case MPA_REQ_SENT:
		process_mpa_reply(ep);
		break;
	case MPA_REQ_WAIT:
		/*
		 * XXX
		 * Set local and remote addrs here because when we
		 * dequeue the newly accepted socket, they aren't set
		 * yet in the pcb!
		 */
		in_getsockaddr(ep->com.so, (struct sockaddr **)&local);
		in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote);
		CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__,
-		    inet_ntoa(local->sin_addr),
-		    inet_ntoa(remote->sin_addr));
+		    inet_ntoa_r(local->sin_addr, local_str),
+		    inet_ntoa_r(remote->sin_addr, remote_str));
		ep->com.local_addr = *local;
		ep->com.remote_addr = *remote;
		free(local, M_SONAME);
		free(remote, M_SONAME);
		process_mpa_request(ep);
		break;
	default:
		if (sbavail(&ep->com.so->so_rcv))
			printf("%s Unexpected streaming data."
			    " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n",
			    __FUNCTION__, ep, state_read(&ep->com), ep->com.so,
			    ep->com.so->so_state, sbavail(&ep->com.so->so_rcv),
			    ep->com.so->so_rcv.sb_mb);
		break;
	}
	return;
}

static void
process_connected(struct iwch_ep *ep)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep,
	    ep->com.so, states[ep->com.state]);
	if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) {
		send_mpa_req(ep);
	} else {
		connect_reply_upcall(ep, -ep->com.so->so_error);
		close_socket(&ep->com, 0);
		state_set(&ep->com, DEAD);
		put_ep(&ep->com);
	}
}

void
process_newconn(struct iw_cm_id *parent_cm_id, struct socket *child_so)
{
	struct iwch_ep *child_ep;
	struct sockaddr_in *local;
	struct sockaddr_in *remote;
	struct iwch_ep *parent_ep = parent_cm_id->provider_data;
+#ifdef KTR
+	char buf[INET_ADDRSTRLEN];
+#endif

	CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__,
	    parent_ep, parent_ep->com.so);
	if (!child_so) {
		log(LOG_ERR, "%s - invalid child socket!\n", __func__);
		return;
	}
	child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT);
	if (!child_ep) {
		log(LOG_ERR, "%s - failed to allocate ep entry!\n",
		    __FUNCTION__);
		return;
	}
	SOCKBUF_LOCK(&child_so->so_rcv);
	soupcall_set(child_so, SO_RCV, iwch_so_upcall, child_ep);
	SOCKBUF_UNLOCK(&child_so->so_rcv);

	in_getsockaddr(child_so, (struct sockaddr **)&local);
	in_getpeeraddr(child_so, (struct sockaddr **)&remote);

	CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__,
-	    inet_ntoa(remote->sin_addr), ntohs(remote->sin_port));
+	    inet_ntoa_r(remote->sin_addr, buf), ntohs(remote->sin_port));
	child_ep->com.tdev = parent_ep->com.tdev;
	child_ep->com.local_addr.sin_family = parent_ep->com.local_addr.sin_family;
	child_ep->com.local_addr.sin_port = parent_ep->com.local_addr.sin_port;
	child_ep->com.local_addr.sin_addr.s_addr =
	    parent_ep->com.local_addr.sin_addr.s_addr;
	child_ep->com.local_addr.sin_len = parent_ep->com.local_addr.sin_len;
	child_ep->com.remote_addr.sin_family = remote->sin_family;
	child_ep->com.remote_addr.sin_port = remote->sin_port;
	child_ep->com.remote_addr.sin_addr.s_addr = remote->sin_addr.s_addr;
	child_ep->com.remote_addr.sin_len = remote->sin_len;
	child_ep->com.so = child_so;
	child_ep->com.cm_id = NULL;
	child_ep->com.thread = parent_ep->com.thread;
	child_ep->parent_ep = parent_ep;

	free(local, M_SONAME);
	free(remote, M_SONAME);
	get_ep(&parent_ep->com);
	callout_init(&child_ep->timer, 1);
	state_set(&child_ep->com, MPA_REQ_WAIT);
	start_ep_timer(child_ep);

	/* maybe the request has already been queued up on the socket...
*/ process_mpa_request(child_ep); } static int iwch_so_upcall(struct socket *so, void *arg, int waitflag) { struct iwch_ep *ep = arg; CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); mtx_lock(&req_lock); if (ep && ep->com.so && !ep->com.entry.tqe_prev) { get_ep(&ep->com); TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task); } mtx_unlock(&req_lock); return (SU_OK); } static void process_socket_event(struct iwch_ep *ep) { int state = state_read(&ep->com); struct socket *so = ep->com.so; CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); if (state == CONNECTING) { process_connected(ep); return; } if (state == LISTEN) { /* socket listening events are handled at IWCM */ CTR3(KTR_IW_CXGB, "%s Invalid ep state:%u, ep:%p", __func__, ep->com.state, ep); BUG(); return; } /* connection error */ if (so->so_error) { process_conn_error(ep); return; } /* peer close */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) { process_peer_close(ep); return; } /* close complete */ if (so->so_state & (SS_ISDISCONNECTED)) { process_close_complete(ep); return; } /* rx data */ process_data(ep); return; } static void process_req(void *ctx, int pending) { struct iwch_ep_common *epc; CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__); mtx_lock(&req_lock); while (!TAILQ_EMPTY(&req_list)) { epc = TAILQ_FIRST(&req_list); TAILQ_REMOVE(&req_list, epc, entry); epc->entry.tqe_prev = NULL; mtx_unlock(&req_lock); if (epc->so) process_socket_event((struct iwch_ep *)epc); put_ep(epc); mtx_lock(&req_lock); } mtx_unlock(&req_lock); } int iwch_cm_init(void) { TAILQ_INIT(&req_list); mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF); iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &iw_cxgb_taskq); if (iw_cxgb_taskq == NULL) { printf("failed to allocate iw_cxgb taskqueue\n"); return (ENOMEM); } taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq"); TASK_INIT(&iw_cxgb_task, 0, process_req, NULL); return (0); } void iwch_cm_term(void) { taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task); taskqueue_free(iw_cxgb_taskq); } void iwch_cm_init_cpl(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, terminate); t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, ec_status); } void iwch_cm_term_cpl(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, NULL); t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, NULL); } #endif Index: head/sys/fs/nfsserver/nfs_nfsdkrpc.c =================================================================== --- head/sys/fs/nfsserver/nfs_nfsdkrpc.c (revision 313820) +++ head/sys/fs/nfsserver/nfs_nfsdkrpc.c (revision 313821) @@ -1,566 +1,570 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_kgssapi.h" #include #include #include #include #include #include NFSDLOCKMUTEX; NFSV4ROOTLOCKMUTEX; struct nfsv4lock nfsd_suspend_lock; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. */ int newnfs_nfsv3_procid[NFS_V3NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, }; SYSCTL_DECL(_vfs_nfsd); SVCPOOL *nfsrvd_pool; static int nfs_privport = 0; SYSCTL_INT(_vfs_nfsd, OID_AUTO, nfs_privport, CTLFLAG_RWTUN, &nfs_privport, 0, "Only allow clients using a privileged port for NFSv2 and 3"); static int nfs_minvers = NFS_VER2; SYSCTL_INT(_vfs_nfsd, OID_AUTO, server_min_nfsvers, CTLFLAG_RWTUN, &nfs_minvers, 0, "The lowest version of NFS handled by the server"); static int nfs_maxvers = NFS_VER4; SYSCTL_INT(_vfs_nfsd, OID_AUTO, server_max_nfsvers, CTLFLAG_RWTUN, &nfs_maxvers, 0, "The highest version of NFS handled by the server"); static int nfs_proc(struct nfsrv_descript *, u_int32_t, SVCXPRT *xprt, struct nfsrvcache **); extern u_long sb_max_adj; extern int newnfs_numnfsd; extern struct proc *nfsd_master_proc; /* * NFS server system calls */ static void nfssvc_program(struct svc_req *rqst, SVCXPRT *xprt) { struct nfsrv_descript nd; struct nfsrvcache *rp = NULL; int cacherep, credflavor; memset(&nd, 0, sizeof(nd)); if (rqst->rq_vers == NFS_VER2) { if (rqst->rq_proc > NFSV2PROC_STATFS || newnfs_nfsv3_procid[rqst->rq_proc] == NFSPROC_NOOP) { svcerr_noproc(rqst); svc_freereq(rqst); goto out; } nd.nd_procnum = newnfs_nfsv3_procid[rqst->rq_proc]; nd.nd_flag = ND_NFSV2; } else if (rqst->rq_vers == NFS_VER3) { if (rqst->rq_proc >= NFS_V3NPROCS) { svcerr_noproc(rqst); svc_freereq(rqst); goto out; } nd.nd_procnum = rqst->rq_proc; nd.nd_flag = ND_NFSV3; } else { if (rqst->rq_proc != NFSPROC_NULL && rqst->rq_proc != NFSV4PROC_COMPOUND) { svcerr_noproc(rqst); svc_freereq(rqst); goto out; } nd.nd_procnum = rqst->rq_proc; nd.nd_flag = ND_NFSV4; } /* * Note: we want rq_addr, not svc_getrpccaller for nd_nam2 - * NFS_SRVMAXDATA uses a NULL value for nd_nam2 to detect TCP * mounts. 
 */
	nd.nd_mrep = rqst->rq_args;
	rqst->rq_args = NULL;
	newnfs_realign(&nd.nd_mrep, M_WAITOK);
	nd.nd_md = nd.nd_mrep;
	nd.nd_dpos = mtod(nd.nd_md, caddr_t);
	nd.nd_nam = svc_getrpccaller(rqst);
	nd.nd_nam2 = rqst->rq_addr;
	nd.nd_mreq = NULL;
	nd.nd_cred = NULL;

	if (nfs_privport && (nd.nd_flag & ND_NFSV4) == 0) {
		/* Check if source port is privileged */
		u_short port;
		struct sockaddr *nam = nd.nd_nam;
		struct sockaddr_in *sin;

		sin = (struct sockaddr_in *)nam;
		/*
		 * INET/INET6 - same code:
		 * sin_port and sin6_port are at same offset
		 */
		port = ntohs(sin->sin_port);
		if (port >= IPPORT_RESERVED &&
		    nd.nd_procnum != NFSPROC_NULL) {
#ifdef INET6
-			char b6[INET6_ADDRSTRLEN];
+			char buf[INET6_ADDRSTRLEN];
+#else
+			char buf[INET_ADDRSTRLEN];
+#endif
+#ifdef INET6
#if defined(KLD_MODULE)
			/* Do not use ip6_sprintf: the nfs module should work without INET6. */
#define	ip6_sprintf(buf, a)					\
			(sprintf((buf), "%x:%x:%x:%x:%x:%x:%x:%x",	\
			    (a)->s6_addr16[0], (a)->s6_addr16[1],	\
			    (a)->s6_addr16[2], (a)->s6_addr16[3],	\
			    (a)->s6_addr16[4], (a)->s6_addr16[5],	\
			    (a)->s6_addr16[6], (a)->s6_addr16[7]),	\
			    (buf))
#endif
#endif
			printf("NFS request from unprivileged port (%s:%d)\n",
#ifdef INET6
			    sin->sin_family == AF_INET6 ?
-			    ip6_sprintf(b6, &satosin6(sin)->sin6_addr) :
+			    ip6_sprintf(buf, &satosin6(sin)->sin6_addr) :
#if defined(KLD_MODULE)
#undef ip6_sprintf
#endif
#endif
-			    inet_ntoa(sin->sin_addr), port);
+			    inet_ntoa_r(sin->sin_addr, buf), port);
			svcerr_weakauth(rqst);
			svc_freereq(rqst);
			m_freem(nd.nd_mrep);
			goto out;
		}
	}

	if (nd.nd_procnum != NFSPROC_NULL) {
		if (!svc_getcred(rqst, &nd.nd_cred, &credflavor)) {
			svcerr_weakauth(rqst);
			svc_freereq(rqst);
			m_freem(nd.nd_mrep);
			goto out;
		}

		/* Set the flag based on credflavor */
		if (credflavor == RPCSEC_GSS_KRB5) {
			nd.nd_flag |= ND_GSS;
		} else if (credflavor == RPCSEC_GSS_KRB5I) {
			nd.nd_flag |= (ND_GSS | ND_GSSINTEGRITY);
		} else if (credflavor == RPCSEC_GSS_KRB5P) {
			nd.nd_flag |= (ND_GSS | ND_GSSPRIVACY);
		} else if (credflavor != AUTH_SYS) {
			svcerr_weakauth(rqst);
			svc_freereq(rqst);
			m_freem(nd.nd_mrep);
			goto out;
		}

#ifdef MAC
		mac_cred_associate_nfsd(nd.nd_cred);
#endif
		/*
		 * Get a refcnt (shared lock) on nfsd_suspend_lock.
		 * NFSSVC_SUSPENDNFSD will take an exclusive lock on
		 * nfsd_suspend_lock to suspend these threads.
		 * The call to nfsv4_lock() that precedes nfsv4_getref()
		 * ensures that the acquisition of the exclusive lock
		 * takes priority over acquisition of the shared lock by
		 * waiting for any exclusive lock request to complete.
		 * This must be done here, before the check of
		 * nfsv4root exports by nfsvno_v4rootexport().
*/ NFSLOCKV4ROOTMUTEX(); nfsv4_lock(&nfsd_suspend_lock, 0, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL); nfsv4_getref(&nfsd_suspend_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL); NFSUNLOCKV4ROOTMUTEX(); if ((nd.nd_flag & ND_NFSV4) != 0) { nd.nd_repstat = nfsvno_v4rootexport(&nd); if (nd.nd_repstat != 0) { NFSLOCKV4ROOTMUTEX(); nfsv4_relref(&nfsd_suspend_lock); NFSUNLOCKV4ROOTMUTEX(); svcerr_weakauth(rqst); svc_freereq(rqst); m_freem(nd.nd_mrep); goto out; } } cacherep = nfs_proc(&nd, rqst->rq_xid, xprt, &rp); NFSLOCKV4ROOTMUTEX(); nfsv4_relref(&nfsd_suspend_lock); NFSUNLOCKV4ROOTMUTEX(); } else { NFSMGET(nd.nd_mreq); nd.nd_mreq->m_len = 0; cacherep = RC_REPLY; } if (nd.nd_mrep != NULL) m_freem(nd.nd_mrep); if (nd.nd_cred != NULL) crfree(nd.nd_cred); if (cacherep == RC_DROPIT) { if (nd.nd_mreq != NULL) m_freem(nd.nd_mreq); svc_freereq(rqst); goto out; } if (nd.nd_mreq == NULL) { svcerr_decode(rqst); svc_freereq(rqst); goto out; } if (nd.nd_repstat & NFSERR_AUTHERR) { svcerr_auth(rqst, nd.nd_repstat & ~NFSERR_AUTHERR); if (nd.nd_mreq != NULL) m_freem(nd.nd_mreq); } else if (!svc_sendreply_mbuf(rqst, nd.nd_mreq)) { svcerr_systemerr(rqst); } if (rp != NULL) { nfsrvd_sentcache(rp, (rqst->rq_reply_seq != 0 || SVC_ACK(xprt, NULL)), rqst->rq_reply_seq); } svc_freereq(rqst); out: if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); NFSEXITCODE(0); } /* * Check the cache and, optionally, do the RPC. * Return the appropriate cache response. */ static int nfs_proc(struct nfsrv_descript *nd, u_int32_t xid, SVCXPRT *xprt, struct nfsrvcache **rpp) { struct thread *td = curthread; int cacherep = RC_DOIT, isdgram, taglen = -1; struct mbuf *m; u_char tag[NFSV4_SMALLSTR + 1], *tagstr = NULL; u_int32_t minorvers = 0; uint32_t ack; *rpp = NULL; if (nd->nd_nam2 == NULL) { nd->nd_flag |= ND_STREAMSOCK; isdgram = 0; } else { isdgram = 1; } /* * Two cases: * 1 - For NFSv2 over UDP, if we are near our malloc/mget * limit, just drop the request. There is no * NFSERR_RESOURCE or NFSERR_DELAY for NFSv2 and the * client will timeout/retry over UDP in a little while. * 2 - nd_repstat == 0 && nd_mreq == NULL, which * means a normal nfs rpc, so check the cache */ if ((nd->nd_flag & ND_NFSV2) && nd->nd_nam2 != NULL && nfsrv_mallocmget_limit()) { cacherep = RC_DROPIT; } else { /* * For NFSv3, play it safe and assume that the client is * doing retries on the same TCP connection. */ if ((nd->nd_flag & (ND_NFSV4 | ND_STREAMSOCK)) == ND_STREAMSOCK) nd->nd_flag |= ND_SAMETCPCONN; nd->nd_retxid = xid; nd->nd_tcpconntime = NFSD_MONOSEC; nd->nd_sockref = xprt->xp_sockref; if ((nd->nd_flag & ND_NFSV4) != 0) nfsd_getminorvers(nd, tag, &tagstr, &taglen, &minorvers); if ((nd->nd_flag & ND_NFSV41) != 0) /* NFSv4.1 caches replies in the session slots. */ cacherep = RC_DOIT; else { cacherep = nfsrvd_getcache(nd); ack = 0; SVC_ACK(xprt, &ack); nfsrc_trimcache(xprt->xp_sockref, ack, 0); } } /* * Handle the request. There are three cases. * RC_DOIT - do the RPC * RC_REPLY - return the reply already created * RC_DROPIT - just throw the request away */ if (cacherep == RC_DOIT) { if ((nd->nd_flag & ND_NFSV41) != 0) nd->nd_xprt = xprt; nfsrvd_dorpc(nd, isdgram, tagstr, taglen, minorvers, td); if ((nd->nd_flag & ND_NFSV41) != 0) { if (nd->nd_repstat != NFSERR_REPLYFROMCACHE && (nd->nd_flag & ND_SAVEREPLY) != 0) { /* Cache a copy of the reply. 
*/ m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK); } else m = NULL; if ((nd->nd_flag & ND_HASSEQUENCE) != 0) nfsrv_cache_session(nd->nd_sessionid, nd->nd_slotid, nd->nd_repstat, &m); if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) nd->nd_repstat = 0; cacherep = RC_REPLY; } else { if (nd->nd_repstat == NFSERR_DONTREPLY) cacherep = RC_DROPIT; else cacherep = RC_REPLY; *rpp = nfsrvd_updatecache(nd); } } if (tagstr != NULL && taglen > NFSV4_SMALLSTR) free(tagstr, M_TEMP); NFSEXITCODE2(0, nd); return (cacherep); } static void nfssvc_loss(SVCXPRT *xprt) { uint32_t ack; ack = 0; SVC_ACK(xprt, &ack); nfsrc_trimcache(xprt->xp_sockref, ack, 1); } /* * Adds a socket to the list for servicing by nfsds. */ int nfsrvd_addsock(struct file *fp) { int siz; struct socket *so; int error = 0; SVCXPRT *xprt; static u_int64_t sockref = 0; so = fp->f_data; siz = sb_max_adj; error = soreserve(so, siz, siz); if (error) goto out; /* * Steal the socket from userland so that it doesn't close * unexpectedly. */ if (so->so_type == SOCK_DGRAM) xprt = svc_dg_create(nfsrvd_pool, so, 0, 0); else xprt = svc_vc_create(nfsrvd_pool, so, 0, 0); if (xprt) { fp->f_ops = &badfileops; fp->f_data = NULL; xprt->xp_sockref = ++sockref; if (nfs_minvers == NFS_VER2) svc_reg(xprt, NFS_PROG, NFS_VER2, nfssvc_program, NULL); if (nfs_minvers <= NFS_VER3 && nfs_maxvers >= NFS_VER3) svc_reg(xprt, NFS_PROG, NFS_VER3, nfssvc_program, NULL); if (nfs_maxvers >= NFS_VER4) svc_reg(xprt, NFS_PROG, NFS_VER4, nfssvc_program, NULL); if (so->so_type == SOCK_STREAM) svc_loss_reg(xprt, nfssvc_loss); SVC_RELEASE(xprt); } out: NFSEXITCODE(error); return (error); } /* * Called by nfssvc() for nfsds. Just loops around servicing rpc requests * until it is killed by a signal. */ int nfsrvd_nfsd(struct thread *td, struct nfsd_nfsd_args *args) { char principal[MAXHOSTNAMELEN + 5]; struct proc *p; int error = 0; bool_t ret2, ret3, ret4; error = copyinstr(args->principal, principal, sizeof (principal), NULL); if (error) goto out; /* * Only the first nfsd actually does any work. The RPC code * adds threads to it as needed. Any extra processes offered * by nfsd just exit. If nfsd is new enough, it will call us * once with a structure that specifies how many threads to * use. */ NFSD_LOCK(); if (newnfs_numnfsd == 0) { p = td->td_proc; PROC_LOCK(p); p->p_flag2 |= P2_AST_SU; PROC_UNLOCK(p); newnfs_numnfsd++; NFSD_UNLOCK(); /* An empty string implies AUTH_SYS only. */ if (principal[0] != '\0') { ret2 = rpc_gss_set_svc_name_call(principal, "kerberosv5", GSS_C_INDEFINITE, NFS_PROG, NFS_VER2); ret3 = rpc_gss_set_svc_name_call(principal, "kerberosv5", GSS_C_INDEFINITE, NFS_PROG, NFS_VER3); ret4 = rpc_gss_set_svc_name_call(principal, "kerberosv5", GSS_C_INDEFINITE, NFS_PROG, NFS_VER4); if (!ret2 || !ret3 || !ret4) printf("nfsd: can't register svc name\n"); } nfsrvd_pool->sp_minthreads = args->minthreads; nfsrvd_pool->sp_maxthreads = args->maxthreads; svc_run(nfsrvd_pool); if (principal[0] != '\0') { rpc_gss_clear_svc_name_call(NFS_PROG, NFS_VER2); rpc_gss_clear_svc_name_call(NFS_PROG, NFS_VER3); rpc_gss_clear_svc_name_call(NFS_PROG, NFS_VER4); } NFSD_LOCK(); newnfs_numnfsd--; nfsrvd_init(1); PROC_LOCK(p); p->p_flag2 &= ~P2_AST_SU; PROC_UNLOCK(p); } NFSD_UNLOCK(); out: NFSEXITCODE(error); return (error); } /* * Initialize the data structures for the server. * Handshake with any new nfsds starting up to avoid any chance of * corruption. 
*/ void nfsrvd_init(int terminating) { NFSD_LOCK_ASSERT(); if (terminating) { nfsd_master_proc = NULL; NFSD_UNLOCK(); nfsrv_freeallbackchannel_xprts(); svcpool_close(nfsrvd_pool); NFSD_LOCK(); } else { NFSD_UNLOCK(); nfsrvd_pool = svcpool_create("nfsd", SYSCTL_STATIC_CHILDREN(_vfs_nfsd)); nfsrvd_pool->sp_rcache = NULL; nfsrvd_pool->sp_assign = fhanew_assign; nfsrvd_pool->sp_done = fha_nd_complete; NFSD_LOCK(); } } Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 313820) +++ head/sys/kern/kern_jail.c (revision 313821) @@ -1,4103 +1,4106 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. 
*/ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", "allow.mount.linprocfs", "allow.mount.linsysfs", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", "allow.mount.nolinprocfs", "allow.mount.nolinsysfs", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of 
prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_allow_names) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < nitems(pr_allow_names); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 
1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. 
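 *
 * Parameters arrive as name/value iovec pairs, which is why the count
 * must be even.  A minimal hypothetical userland call creating a
 * persistent jail (the name "demo" is made up) might look like the
 * sketch below; boolean parameters such as "persist" carry a NULL
 * value of zero length, and string lengths include the NUL:
 *
 *	struct iovec iov[4];
 *	int jid;
 *
 *	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
 *	iov[1].iov_base = "demo";	iov[1].iov_len = sizeof("demo");
 *	iov[2].iov_base = "persist";	iov[2].iov_len = sizeof("persist");
 *	iov[3].iov_base = NULL;		iov[3].iov_len = 0;
 *	jid = jail_set(iov, 4, JAIL_CREATE);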
*/ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
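 *
 * For reference, the recurring lookup pattern below treats ENOENT
 * from vfs_copyopt() as "parameter not given" rather than a failure,
 * e.g. for securelevel:
 *
 *	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
 *	if (error == ENOENT)
 *		gotslevel = 0;		(absent: leave the setting alone)
 *	else if (error != 0)
 *		goto done_free;		(present but malformed)
 *	else
 *		gotslevel = 1;		(present and copied out)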
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < nitems(pr_allow_names); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void 
**)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), prison_qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. 
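 *
 * Worked example: given the user-supplied list
 * { 10.0.0.5, 10.0.0.2, 10.0.0.2 }, ip4[0] stays 10.0.0.5 and the
 * tail is sorted to { 10.0.0.2, 10.0.0.2 }; the adjacent comparison
 * below then catches the duplicate and the request fails with EINVAL.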
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), prison_qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; vrele(root); goto done_free; } } } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. 
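 *
 * To summarize the (ab)use:
 *	JAIL_CREATE        and the jid exists	-> EEXIST
 *	JAIL_UPDATE        and the jid is gone	-> ENOENT
 *	JAIL_CREATE|UPDATE and the jid is gone	-> fall through and create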
*/ pr = NULL; ppr = mypr; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. 
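 *
 * To recap the name handling above: creating "foo.bar" from prison0
 * splits the string at the last dot, so parent "foo" must already
 * exist and the leaf becomes "bar"; a jail later created inside the
 * result as "baz" gets the full name "foo.bar.baz".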
*/ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", prison_name(mypr, ppr)); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. */ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. 
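 *
 * A jail may opt back out per address family; e.g. a hypothetical
 * userland request could pass the boolean parameter
 *
 *	iov[0].iov_base = "ip4.nosaddrsel";
 *	iov[0].iov_len = sizeof("ip4.nosaddrsel");
 *	iov[1].iov_base = NULL;
 *	iov[1].iov_len = 0;
 *
 * to turn source address selection back off for IPv4.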
*/ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an erorr. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want others to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP stop checking and return error. 
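 *
 * Example: two sibling jails may both carry just 127.0.0.1, but if
 * one jail holds { 192.0.2.1, 192.0.2.2 }, a sibling requesting
 * 192.0.2.1 clashes and the request fails with EADDRINUSE.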
*/ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (prison_check_ip4_locked(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (prison_check_ip6_locked(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) { prison_deref(pr, created ? PD_LIST_XLOCKED : PD_DEREF | PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* At this point, all valid parameters should have been noted. 
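 *
 * That is, any option no handler marked as seen (other than "errmsg")
 * is unknown; a misspelled name such as "allow.nount" fails here with
 * EINVAL and "unknown parameter: allow.nount" returned in errmsg.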
*/ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref_locked; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. 
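 *
 * Example: setting host.hostname on jail "a" also updates "a.b" and
 * "a.b.c" when neither has its own host info (PR_HOST clear), but
 * the walk stops descending at any child that does.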
*/ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ born = pr->pr_uref == 0; if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { if (born) (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. 
Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_free; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. 
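 *
 * The lastjid form exists to let userland enumerate jails; a
 * hypothetical sketch of the usual loop:
 *
 *	struct iovec iov[2];
 *	int jid, lastjid = 0;
 *
 *	for (;;) {
 *		iov[0].iov_base = "lastjid";
 *		iov[0].iov_len = sizeof("lastjid");
 *		iov[1].iov_base = &lastjid;
 *		iov[1].iov_len = sizeof(lastjid);
 *		jid = jail_get(iov, 2, 0);
 *		if (jid < 0)
 *			break;		(ENOENT: no jail after lastjid)
 *		lastjid = jid;
 *	}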
*/ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_allow_names); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 
1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. 
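 *
 * (Userland view, for reference: jail_remove(2) takes just a jid;
 * sys_jail_remove above walks every descendant through this same
 * function first, so the named prison is always torn down last.)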
*/ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); /* * Start with exclusive hold on allprison_lock to ensure that a possible * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. * But then immediately downgrade it since we don't need to stop * readers. */ sx_xlock(&allprison_lock); sx_downgrade(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = pwd_chroot(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. 
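 *
 * Typical caller pattern in this file (allprison_lock must already
 * be held; the caller drops the prison mutex when done):
 *
 *	sx_xlock(&allprison_lock);
 *	pr = prison_find(jid);
 *	if (pr != NULL) {
 *		... inspect pr with pr->pr_mtx held ...
 *		mtx_unlock(&pr->pr_mtx);
 *	}
 *	sx_xunlock(&allprison_lock);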
*/ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has the specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { int ref; mtx_assert(&pr->pr_mtx, MA_OWNED); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); if (ref == 0) taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; sx_xlock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_deref(pr, pr->pr_uref ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED : PD_LOCKED | PD_LIST_XLOCKED); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; int ref, lasturef; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { KASSERT(pr->pr_uref > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_uref--; lasturef = pr->pr_uref == 0; if (lasturef) pr->pr_ref++; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } else lasturef = 0; if (flags & PD_DEREF) { KASSERT(pr->pr_ref > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_ref--; } ref = pr->pr_ref; mtx_unlock(&pr->pr_mtx); /* * Tell the modules if the last user reference was removed * (even it sticks around in dying state). 
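 *
 * For reference: pr_uref counts "user" references that keep the jail
 * alive and visible, while pr_ref counts all references, internal
 * ones included.  A dying jail has pr_ref > 0 but pr_uref == 0; it
 * still occupies its jid and name until the last pr_ref is dropped.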
*/ if (lasturef) { if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); } /* If the prison still has references, nothing else to do. */ if (ref > 0) { if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT if (racct_enable) prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_uref > 1) pr->pr_uref--; else { /* * Don't remove the last user reference in this context, which * is expected to be a process that is not only locked, but * also half dead. */ pr->pr_ref++; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). 
* * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. 
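 *
 * The core of the check below is a path-prefix test; restated as a
 * standalone sketch (illustrative only):
 *
 *	static int
 *	path_is_under_root(const char *root, const char *mnt)
 *	{
 *		size_t len;
 *
 *		if (strcmp(root, "/") == 0)
 *			return (1);	// jail rooted at "/" sees all
 *		len = strlen(root);
 *		if (strncmp(root, mnt, len) != 0)
 *			return (0);
 *		// reject "/some/pathpath" when root is "/some/path"
 *		return (mnt[len] == '\0' || mnt[len] == '/');
 *	}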
*/ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is ugly check, but this is the only situation when jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have situation where jail's root directory * is "/some/path" and mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check with permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME : case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. 
*/ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. 
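 *
 * From a subsystem's point of view these grants are consulted through
 * priv_check(9); a hypothetical caller looks like
 *
 *	error = priv_check(td, PRIV_NETINET_RAW);
 *	if (error != 0)
 *		return (error);	// EPERM unless allow.raw_sockets is set
 *
 * The conditional cases further down reduce to single pr_allow bit
 * tests of exactly this kind.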
*/ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/phyiscal memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports and reuse in-use * ports. */ case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. 
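 *
 * The list handler below copies each prison's IP arrays with a
 * drop-the-lock-and-retry idiom, since realloc(9) with M_WAITOK may
 * sleep while the prison mutex is held.  The shape of the idiom,
 * generically (sketch with invented names):
 *
 *	for (;;) {
 *		mtx_lock(&obj->lock);
 *		if (bufsize >= obj->needed)
 *			break;			// copy under the lock
 *		need = obj->needed;
 *		mtx_unlock(&obj->lock);
 *		buf = realloc(buf, need, M_TEMP, M_WAITOK);
 *		bufsize = need;
 *	}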
*/ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? 
jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linprocfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linsysfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, 
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
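 *
 * Userland discovers parameter types through these nodes; for string
 * parameters, sysctl_jail_param() above reports the maximum length in
 * the string itself.  A hypothetical sysctlbyname(3) reader:
 *
 *	char buf[16];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("security.jail.param.name", buf, &len,
 *	    NULL, 0) == 0)
 *		printf("max: %s\n", buf);	// e.g. "256" (MAXHOSTNAMELEN)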
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file 
systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linprocfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. 
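 *
 * Note also the lock-avoidance idiom in prison_racct_free() above,
 * which this path relies on via prison_racct_free_locked(): the common
 * case drops a reference with a lock-free compare-and-swap and only
 * takes allprison_lock when the last reference might go away.
 * Generically (sketch with invented names):
 *
 *	old = obj->refcount;
 *	if (old > 1 && atomic_cmpset_int(&obj->refcount, old, old - 1))
 *		return;			// not the last ref, no lock needed
 *	sx_xlock(&list_lock);		// may be last ref: unlink safely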
*/ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; +#ifdef INET + char ip4buf[INET_ADDRSTRLEN]; +#endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < nitems(pr_flag_names); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < nitems(pr_allow_names); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", - inet_ntoa(pr->pr_ip4[ii])); + inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? 
"ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: head/sys/netinet/if_ether.c =================================================================== --- head/sys/netinet/if_ether.c (revision 313820) +++ head/sys/netinet/if_ether.c (revision 313821) @@ -1,1503 +1,1511 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. 
count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #define SIN(s) ((const struct sockaddr_in *)(s)) static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; /* Simple ARP state machine */ enum arp_llinfo_state { ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */ ARP_LLINFO_REACHABLE, /* LLE is valid */ ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */ ARP_LLINFO_DELETED, /* LLE is deleted */ }; SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ static VNET_DEFINE(int, arp_maxtries) = 5; static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ static VNET_DEFINE(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arpt_rexmit VNET(arpt_rexmit) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); /* * Due to the exponential backoff algorithm used for the interval between GARP * retransmissions, the maximum number of retransmissions is limited for * sanity. This limit corresponds to a maximum interval between retransmissions * of 2^16 seconds ~= 18 hours. * * Making this limit more dynamic is more complicated than worthwhile, * especially since sending out GARPs spaced days apart would be of little * use. A maximum dynamic limit would look something like: * * const int max = fls(INT_MAX / hz) - 1; */ #define MAX_GARP_RETRANSMITS 16 static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS); static int garp_rexmit_count = 0; /* GARP retransmission setting. 
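 *
 * The handler declared above presumably clamps writes to the range
 * [0, MAX_GARP_RETRANSMITS]; a hypothetical shape, not the verbatim
 * implementation:
 *
 *	static int
 *	sysctl_garp_rexmit_sketch(SYSCTL_HANDLER_ARGS)
 *	{
 *		int error, v;
 *
 *		v = garp_rexmit_count;
 *		error = sysctl_handle_int(oidp, &v, 0, req);
 *		if (error != 0 || req->newptr == NULL)
 *			return (error);
 *		if (v < 0 || v > MAX_GARP_RETRANSMITS)
 *			return (EINVAL);
 *		garp_rexmit_count = v;
 *		return (0);
 *	}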
*/ SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &garp_rexmit_count, 0, sysctl_garp_rexmit, "I", "Number of times to retransmit GARP packets;" " 0 to disable, maximum of 16"); #define ARP_LOG(pri, ...) do { \ if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la); static void arp_mark_lle_reachable(struct llentry *la); static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); static eventhandler_tag iflladdr_tag; static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; int r_skip_req; if (lle->la_flags & LLE_STATIC) { return; } LLE_WLOCK(lle); if (callout_pending(&lle->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by arpresolve() below. */ LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); switch (lle->ln_state) { case ARP_LLINFO_REACHABLE: /* * Expiration time is approaching. * Let's try to refresh entry if it is still * in use. * * Set r_skip_req to get feedback from * fast path. Change state and re-schedule * ourselves. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); lle->ln_state = ARP_LLINFO_VERIFY; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; case ARP_LLINFO_VERIFY: LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; LLE_REQ_UNLOCK(lle); if (r_skip_req == 0 && lle->la_preempt > 0) { /* Entry was used, issue refresh request */ struct in_addr dst; dst = lle->r_l3addr.addr4; lle->la_preempt--; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); arprequest(ifp, NULL, &dst, NULL); CURVNET_RESTORE(); return; } /* Nothing happened. Reschedule if not too late */ if (lle->la_expire > time_uptime) { callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; } break; case ARP_LLINFO_INCOMPLETE: case ARP_LLINFO_DELETED: break; } if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->lle_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). 
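 *
 * In sum, the timer above walks this little state machine before the
 * teardown code below runs (summary, not new kernel logic):
 *
 *	REACHABLE: expiry near -> set r_skip_req, move to VERIFY
 *	VERIFY:    used and la_preempt left -> ARP probe, stay in VERIFY
 *	           idle, not yet expired    -> just reschedule
 *	           idle and expired         -> fall through, free entry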
*/ if (lle->la_flags & LLE_LINKED) { LLE_REMREF(lle); lltable_unlink_entry(lle->lle_tbl, lle); } IF_AFDATA_UNLOCK(ifp); size_t pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Stores link-layer header for @ifp in format suitable for if_output() * into buffer @buf. Resulting header length is stored in @bufsize. * * Returns 0 on success. */ static int arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, size_t *bufsize) { struct if_encap_req ereq; int error; bzero(buf, *bufsize); bzero(&ereq, sizeof(ereq)); ereq.buf = buf; ereq.bufsize = *bufsize; ereq.rtype = IFENCAP_LL; ereq.family = AF_ARP; ereq.lladdr = ar_tha(ah); ereq.hdata = (u_char *)ah; if (bcast) ereq.flags = IFENCAP_FLAG_BROADCAST; error = ifp->if_requestencap(ifp, &ereq); if (error == 0) *bufsize = ereq.bufsize; return (error); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; struct route ro; int error; if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } IF_ADDR_RUNLOCK(ifp); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); } /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). * We have already checked than we can't use existing lle without * modification so we have to acquire LLE_EXCLUSIVE lle lock. 
* * On success, desten and flags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ static int arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; char *lladdr; int ll_len; if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; if ((flags & LLE_CREATE) == 0) { IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); } if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { + char addrbuf[INET_ADDRSTRLEN]; + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", - inet_ntoa(SIN(dst)->sin_addr), if_name(ifp)); + inet_ntoa_r(SIN(dst)->sin_addr, addrbuf), + if_name(ifp)); m_freem(m); return (EINVAL); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* Prefer ANY existing lle over newly-created one */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; } } if (la == NULL) { m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { if (flags & LLE_ADDRONLY) { lladdr = la->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = la->r_linkdata; ll_len = la->r_hdrlen; } bcopy(lladdr, desten, ll_len); /* Check if we have feedback request from arptimer() */ if (la->r_skip_req != 0) { LLE_REQ_LOCK(la); la->r_skip_req = 0; /* Notify that entry was used */ LLE_REQ_UNLOCK(la); } if (pflags != NULL) *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR); if (plle) { LLE_ADDREF(la); *plle = la; } LLE_WUNLOCK(la); return (0); } renew = (la->la_asked == 0 || la->la_expire != time_uptime); /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->lle_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } LLE_WUNLOCK(la); return (error); } /* * Resolve an IP address into an ethernet address. 
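 *
 * Callers treat the result of these resolvers roughly as follows
 * (hypothetical sketch of the ether_output(9)-side handling):
 *
 *	error = arpresolve(ifp, is_gw, m, dst, eh_dest, &pflags, NULL);
 *	if (error == EWOULDBLOCK)
 *		return (0);	// mbuf queued on the lle, ARP in flight
 *	if (error != 0)
 *		return (error);	// EHOSTDOWN/EHOSTUNREACH past arp_maxtries
 *	// eh_dest now holds the link-layer prepend data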
*/ int arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags, struct llentry **plle) { int error; flags |= LLE_ADDRONLY; error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags, plle); return (error); } /* * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is the storage to put LL header. * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR * * On success, full/partial link header and flags are filled in and * the function returns 0. * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *la = NULL; if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(la->r_linkdata, desten, la->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); /* Check if we have feedback request from arptimer() */ if (la->r_skip_req != 0) { LLE_REQ_LOCK(la); la->r_skip_req = 0; /* Notify that entry was used */ LLE_REQ_UNLOCK(la); } if (plle) { LLE_ADDREF(la); *plle = la; LLE_WUNLOCK(la); } IF_AFDATA_RUNLOCK(ifp); return (0); } if (plle && la) LLE_WUNLOCK(la); IF_AFDATA_RUNLOCK(ifp); return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, desten, pflags, plle)); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; struct ifnet *ifp; char *layer; int hlen; ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); /* Check if length is sufficient */ if (m->m_len < arphdr_len(ar)) { m = m_pullup(m, arphdr_len(ar)); if (m == NULL) { ARP_LOG(LOG_NOTICE, "short packet received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); } hlen = 0; layer = ""; switch (ntohs(ar->ar_hrd)) { case ARPHRD_ETHER: hlen = ETHER_ADDR_LEN; /* RFC 826 */ layer = "ethernet"; break; case ARPHRD_IEEE802: hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */ layer = "fddi"; break; case ARPHRD_ARCNET: hlen = 1; /* RFC 1201, ARC_ADDR_LEN */ layer = "arcnet"; break; case ARPHRD_INFINIBAND: hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ layer = "infiniband"; break; case ARPHRD_IEEE1394: hlen = 0; /* SHALL be 16 */ /* RFC 2734 */ layer = "firewire"; /* * Restrict too long hardware addresses. 
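 *
 * For scale, arphdr_len(ar) in the length checks above expands to
 * sizeof(struct arphdr) + 2 * ar_hln + 2 * ar_pln, e.g.
 *
 *	8 + 2*6 + 2*4 == 28	// Ethernet/IPv4 ARP body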
* Currently we are capable of handling 20-byte * addresses ( sizeof(lle->ll_addr) ) */ if (ar->ar_hln >= 20) hlen = 16; break; default: ARP_LOG(LOG_NOTICE, "packet with unknown hardware format 0x%02d received on " "%s\n", ntohs(ar->ar_hrd), if_name(ifp)); m_freem(m); return; } if (hlen != 0 && hlen != ar->ar_hln) { ARP_LOG(LOG_NOTICE, "packet with invalid %s address length %d received on %s\n", layer, ar->ar_hln, if_name(ifp)); m_freem(m); return; } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); static void in_arpinput(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL, *la_tmp; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; struct sockaddr *dst; struct nhop4_basic nh4; uint8_t linkhdr[LLE_MAX_LINKHDR]; struct route ro; size_t linkhdrsize; int lladdr_off; int error; + char addrbuf[INET_ADDRSTRLEN]; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * We already have checked that mbuf contains enough contiguous data * to hold entire arp message according to the arp header. */ ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. 
*/ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); /* * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " - "%s!\n", inet_ntoa(isaddr)); + "%s!\n", inet_ntoa_r(isaddr, addrbuf)); goto drop; } if (ifp->if_addrlen != ah->ar_hln) { ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. 
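 *
 * The warning below is one of the call sites this patch converts from
 * inet_ntoa() to inet_ntoa_r(): the former formats into a single
 * static buffer, so two conversions alive at once clobber each other.
 * With the reentrant form the caller supplies storage (sketch; src and
 * dst stand for any two addresses):
 *
 *	struct in_addr src, dst;	// filled in elsewhere
 *	char b1[INET_ADDRSTRLEN], b2[INET_ADDRSTRLEN];
 *
 *	printf("%s -> %s\n", inet_ntoa_r(src, b1), inet_ntoa_r(dst, b2));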
*/ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - inet_ntoa(isaddr), ifp->if_xname); + inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; dst = (struct sockaddr *)&sin; IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { /* * Request/reply to our address, but no lle exists yet. * Calculate full link prepend to use in lle. */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) goto reply; /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { /* * lle creation may fail if the source address belongs * to a non-directly connected subnet. However, we * will try to answer the request instead of dropping * the frame. */ goto reply; } lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* * Check if the lle still does not exist. * If it does, that means that we either * 1) have configured it explicitly, via * 1a) an 'arp -s' static entry or * 1b) an interface address static record * or * 2) it was the result of sending the first packet to the host * or * 3) it was another arp reply packet we handled in a * different thread. * * In all cases except 3) we definitely need to prefer the * existing lle. For the sake of simplicity, prefer any * existing lle over a newly-created one. */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp == NULL) { arp_mark_lle_reachable(la); LLE_WUNLOCK(la); } else { /* Free the newly-created entry and handle the packet */ lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; la_tmp = NULL; arp_check_update_lle(ah, isaddr, ifp, bridged, la); /* arp_check_update_lle() returns @la unlocked */ } la = NULL; } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut: the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { struct llentry *lle = NULL; sin.sin_addr = itaddr; IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_RUNLOCK(ifp); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; /* XXX MRT use table 0 for arp reply */ if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (nh4.nh_ifp == ifp) goto drop; (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network.
*/ /* XXX MRT use table 0 for arp checks */ if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0) goto drop; if (nh4.nh_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s\n", - inet_ntoa(isaddr), ifp->if_xname); + inet_ntoa_r(isaddr, addrbuf), + ifp->if_xname); goto drop; } #ifdef DEBUG_PROXY - printf("arp: proxying for %s\n", inet_ntoa(itaddr)); + printf("arp: proxying for %s\n", + inet_ntoa_r(itaddr, addrbuf)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", - inet_ntoa(itaddr)); + inet_ntoa_r(itaddr, addrbuf)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); /* * arp_fillheader() may fail due to lack of support inside encap request * routing. This is not necessarily an error; AF_ARP can/should be handled * by if_output(). */ if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); goto drop; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif /* * Checks received arp data against the existing @la. * Updates lle state/performs notification if necessary.
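For reference, the reply path above rewrites the request in place, per RFC 826: the target fields take the old sender values, the sender fields become ours, and the opcode flips to ARPOP_REPLY. A self-contained sketch over a hypothetical fixed Ethernet/IPv4 layout (the kernel instead uses the variable-length ar_sha()/ar_tpa() accessors):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define ARPOP_REPLY     2               /* matches <net/if_arp.h> */

struct arp_ether_ip {                   /* illustration only */
        uint16_t ar_hrd, ar_pro;
        uint8_t  ar_hln, ar_pln;
        uint16_t ar_op;
        uint8_t  sha[6], spa[4];        /* sender hardware/protocol addr */
        uint8_t  tha[6], tpa[4];        /* target hardware/protocol addr */
};

/* Turn a request into a reply claiming our_mac/our_ip; note the target
 * fields must be filled from the sender fields before those are clobbered. */
static void
arp_make_reply(struct arp_ether_ip *ah, const uint8_t our_mac[6],
    const uint8_t our_ip[4])
{
        memcpy(ah->tha, ah->sha, sizeof(ah->tha)); /* target = old sender */
        memcpy(ah->tpa, ah->spa, sizeof(ah->tpa));
        memcpy(ah->sha, our_mac, sizeof(ah->sha)); /* sender = us */
        memcpy(ah->spa, our_ip, sizeof(ah->spa));
        ah->ar_op = htons(ARPOP_REPLY);
}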
*/ static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la) { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; + char addrbuf[INET_ADDRSTRLEN]; LLE_WLOCK_ASSERT(la); /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", - inet_ntoa(isaddr), + inet_ntoa_r(isaddr, addrbuf), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); return; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - inet_ntoa(isaddr), ifp->if_xname); + inet_ntoa_r(isaddr, addrbuf), + ifp->if_xname); return; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", - inet_ntoa(isaddr), + inet_ntoa_r(isaddr, addrbuf), ifp->if_addrlen, (u_char *)&la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } /* Calculate full link prepend to use in lle */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) return; /* Check if something has changed */ if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { /* Try to perform LLE update */ if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off) == 0) return; /* Clear fast path feedback request if set */ la->r_skip_req = 0; } arp_mark_lle_reachable(la); /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. */ if (la->la_hold != NULL) { m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; lltable_fill_sa_entry(la, &sa); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } static void arp_mark_lle_reachable(struct llentry *la) { int canceled, wtime; LLE_WLOCK_ASSERT(la); la->ln_state = ARP_LLINFO_REACHABLE; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit; if (wtime < 0) wtime = V_arpt_keep; canceled = callout_reset(&la->lle_timer, hz * wtime, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; } /* * Add a permanent link-layer record for a given interface address. */ static __noinline void arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst) { struct llentry *lle, *lle_tmp; /* * An interface address LLE record is considered static * because kernel code relies on the LLE_STATIC flag to check * if these entries can be rewritten by arp updates.
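The rearm arithmetic in arp_mark_lle_reachable() above is worth spelling out: the callout is scheduled early enough that all resolution retries fit before la_expire. A worked version with the stock tunables, which are assumptions here (arpt_keep=1200s, arp_maxtries=5, arpt_rexmit=1s):

#include <stdio.h>

int
main(void)
{
        int arpt_keep = 1200;   /* net.link.ether.inet.max_age (assumed) */
        int arp_maxtries = 5;   /* net.link.ether.inet.maxtries (assumed) */
        int arpt_rexmit = 1;    /* seconds between unanswered probes */
        int wtime;

        wtime = arpt_keep - arp_maxtries * arpt_rexmit;
        if (wtime < 0)          /* guard against nonsensical tunables */
                wtime = arpt_keep;
        printf("revalidate after %d s; entry expires after %d s\n",
            wtime, arpt_keep);  /* 1195 and 1200 */
        return (0);
}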
*/ lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst); if (lle == NULL) { log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); return; } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Unlink any entry if exists */ lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) lltable_unlink_entry(LLTABLE(ifp), lle_tmp); lltable_link_entry(LLTABLE(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); if (lle_tmp != NULL) lltable_free_entry(LLTABLE(ifp), lle_tmp); } /* * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range * of valid values. */ static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS) { int error; int rexmit_count = *(int *)arg1; error = sysctl_handle_int(oidp, &rexmit_count, 0, req); /* Enforce limits on any new value that may have been set. */ if (!error && req->newptr) { /* A new value was set. */ if (rexmit_count < 0) { rexmit_count = 0; } else if (rexmit_count > MAX_GARP_RETRANSMITS) { rexmit_count = MAX_GARP_RETRANSMITS; } *(int *)arg1 = rexmit_count; } return (error); } /* * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to * retransmit it again. A pending callout owns a reference to the ifa. */ static void garp_rexmit(void *arg) { struct in_ifaddr *ia = arg; if (callout_pending(&ia->ia_garp_timer) || !callout_active(&ia->ia_garp_timer)) { IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); ifa_free(&ia->ia_ifa); return; } /* * Drop lock while the ARP request is generated. */ IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr, &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp)); /* * Increment the count of retransmissions. If the count has reached the * maximum value, stop sending the GARP packets. Otherwise, schedule * the callout to retransmit another GARP packet. */ ++ia->ia_garp_count; if (ia->ia_garp_count >= garp_rexmit_count) { ifa_free(&ia->ia_ifa); } else { int rescheduled; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); rescheduled = callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia); IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); if (rescheduled) { ifa_free(&ia->ia_ifa); } } } /* * Start the GARP retransmit timer. * * A single GARP is always transmitted when an IPv4 address is added * to an interface and that is usually sufficient. However, in some * circumstances, such as when a shared address is passed between * cluster nodes, this single GARP may occasionally be dropped or * lost. This can lead to neighbors on the network link working with a * stale ARP cache and sending packets destined for that address to * the node that previously owned the address, which may not respond. * * To avoid this situation, GARP retransmits can be enabled by setting * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater * than zero. The setting represents the maximum number of * retransmissions. The interval between retransmissions is calculated * using an exponential backoff algorithm, doubling each time, so the * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds). 
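The retransmission schedule described above falls directly out of the callout arming in garp_timer_start()/garp_rexmit(): the delay is (1 << ia_garp_count) * hz ticks. A small model of the schedule (hz and the sysctl limit are placeholders):

#include <stdio.h>

int
main(void)
{
        int hz = 1000;                  /* assumed kern.hz */
        int garp_rexmit_count = 5;      /* sysctl-controlled maximum, assumed */
        int total = 0;

        for (int count = 0; count < garp_rexmit_count; count++) {
                int delay = 1 << count; /* 1, 2, 4, 8, 16 seconds */
                total += delay;
                printf("GARP %d: %d ticks (%d s delay, %d s elapsed)\n",
                    count + 1, delay * hz, delay, total);
        }
        return (0);
}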
*/ static void garp_timer_start(struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *) ifa; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); ia->ia_garp_count = 0; if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia) == 0) { ifa_ref(ifa); } IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { const struct sockaddr_in *dst_in; const struct sockaddr *dst; if (ifa->ifa_carp != NULL) return; dst = ifa->ifa_addr; dst_in = (const struct sockaddr_in *)dst; if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY) return; arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); if (garp_rexmit_count > 0) { garp_timer_start(ifa); } arp_add_ifa_lle(ifp, dst); } void arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr) { if (ntohl(addr.s_addr) != INADDR_ANY) arprequest(ifp, &addr, &addr, enaddr); } /* * Sends gratuitous ARPs for each ifaddr to notify other * nodes about the address change. */ static __noinline void arp_handle_ifllchange(struct ifnet *ifp) { struct ifaddr *ifa; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } } /* * A handler for interface link layer address change event. */ static void arp_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE(ifp)); if ((ifp->if_flags & IFF_UP) != 0) arp_handle_ifllchange(ifp); } static void vnet_arp_init(void) { if (IS_DEFAULT_VNET(curvnet)) { netisr_register(&arp_nh); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE else netisr_register_vnet(&arp_nh); #endif } VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, vnet_arp_init, 0); #ifdef VIMAGE /* * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5. */ static void vnet_arp_destroy(__unused void *arg) { netisr_unregister_vnet(&arp_nh); } VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_arp_destroy, NULL); #endif Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 313820) +++ head/sys/netinet/igmp.c (revision 313821) @@ -1,3678 +1,3722 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void 
igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. 
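The permitted order documented above (IN_MULTI_LOCK, then IGMP_LOCK, then IF_ADDR_LOCK) is the usual deadlock-avoidance discipline: any subset may be held, but acquisition must never run backwards. A userland model, with pthread mutexes standing in for the kernel locks:

#include <pthread.h>

static pthread_mutex_t in_multi_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t igmp_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t if_addr_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Mirrors the query handlers: all three locks, in the documented order. */
static void
walk_group_state(void)
{
        pthread_mutex_lock(&in_multi_mtx);
        pthread_mutex_lock(&igmp_mtx);
        pthread_mutex_lock(&if_addr_mtx);
        /* ... iterate if_multiaddrs, update per-group timers ... */
        pthread_mutex_unlock(&if_addr_mtx);
        pthread_mutex_unlock(&igmp_mtx);
        pthread_mutex_unlock(&in_multi_mtx);
}

int
main(void)
{
        walk_group_state();
        return (0);
}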
*/ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
*/ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * -inet_ntoa_haddr(in_addr_t haddr) +inet_ntoa_haddr(in_addr_t haddr, char *addrbuf) { struct in_addr ia; ia.s_addr = htonl(haddr); - return (inet_ntoa(ia)); + return (inet_ntoa_r(ia, addrbuf)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
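The KTR helper converted just above now threads the caller's buffer through as well. Its job, sketched portably with inet_ntop() standing in for the kernel's inet_ntoa_r(): byte-swap a host-order address back to network order before formatting it:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

static const char *
ntoa_haddr(in_addr_t haddr, char *buf, socklen_t len)
{
        struct in_addr ia;

        ia.s_addr = htonl(haddr);       /* host -> network order */
        return (inet_ntop(AF_INET, &ia, buf, len));
}

int
main(void)
{
        char addrbuf[INET_ADDRSTRLEN];

        /* 0xe0000001 is 224.0.0.1 (all-hosts) in host order. */
        printf("%s\n", ntoa_haddr(0xe0000001, addrbuf, sizeof(addrbuf)));
        return (0);
}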
*/ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = (char)IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. 
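igmp_ra_alloc() above assembles the RFC 2113 Router Alert option: type 0x94 (IPOPT_RA), length 4, and a 16-bit value of 0 ("routers shall examine packet"). Since IPOPT_EOL is 0, the kernel's {IPOPT_RA, 0x04, IPOPT_EOL, 0x00} initializer produces exactly those wire bytes:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        const uint8_t ra_opt[4] = { 0x94, 0x04, 0x00, 0x00 };

        printf("type 0x%02x len %u value %u\n",
            (unsigned)ra_opt[0], (unsigned)ra_opt[1],
            (unsigned)(ra_opt[2] << 8 | ra_opt[3]));
        return (0);
}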
*/ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Membership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic.
*/ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. 
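The timer conversion in the v2 handler above, igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE, turns the query's tenths-of-a-second code into fast-timeout ticks. A worked version with the stock constants, taken here as assumptions (PR_FASTHZ=5 from <sys/protosw.h>, IGMP_TIMER_SCALE=10):

#include <stdio.h>

#define PR_FASTHZ        5      /* fast-timeout ticks per second (assumed) */
#define IGMP_TIMER_SCALE 10     /* igmp_code units per second (assumed) */

int
main(void)
{
        int codes[] = { 1, 100, 255 }; /* 0.1 s, 10 s, 25.5 s */

        for (int i = 0; i < 3; i++) {
                int timer = codes[i] * PR_FASTHZ / IGMP_TIMER_SCALE;

                if (timer == 0)
                        timer = 1; /* round an active query up, never to idle */
                printf("code %3d -> %3d ticks\n", codes[i], timer);
        }
        return (0);
}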
*/ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, + ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, - inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); + inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. 
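The maxresp/qqi decoding at the top of igmp_input_v3_query() implements the 8-bit floating-point encoding of RFC 3376, section 4.1.1: codes below 128 are literal, and larger codes carry a 3-bit exponent and a 4-bit mantissa with an implicit leading 1. IGMP_EXP()/IGMP_MANT() are reproduced below on the assumption that they match igmp_var.h:

#include <stdint.h>
#include <stdio.h>

#define IGMP_EXP(x)     (((x) >> 4) & 0x07)     /* assumed definition */
#define IGMP_MANT(x)    (((x) & 0x0f) | 0x10)   /* assumed definition */

static uint32_t
igmp_decode_maxresp(uint8_t code)
{
        if (code < 128)
                return (code);  /* literal, in tenths of a second */
        return ((uint32_t)IGMP_MANT(code) << (IGMP_EXP(code) + 3));
}

int
main(void)
{
        /* 0x8f: exp 0, mant 0x1f -> 0x1f << 3 = 248 tenths (24.8 s). */
        printf("0x8f -> %u tenths\n", igmp_decode_maxresp(0x8f));
        /* 0xff: exp 7, mant 0x1f -> 0x1f << 10 = 31744 tenths (~53 min). */
        printf("0xff -> %u tenths\n", igmp_decode_maxresp(0xff));
        return (0);
}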
*/ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", - inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmpv3->igmp_group, addrbuf), ifp, + ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. 
*/ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need to * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so.
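The source-list walk above relies on the wire layout: the 32-bit source records start immediately after the fixed igmpv3 header, hence the (const struct in_addr *)(igmpv3 + 1) pointer step. The same header-plus-trailing-array pattern in miniature (the struct below is a stand-in, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct msg_hdr {                /* stand-in for struct igmpv3 */
        uint8_t  type, code;
        uint16_t numsrc;        /* count of trailing records, host order here */
};

int
main(void)
{
        /* A header followed by two records, laid out back to back. */
        struct {
                struct msg_hdr h;
                uint32_t srcs[2];
        } pkt = { { 0x11, 0, 2 }, { 0x0a000001, 0x0a000002 } };
        const uint32_t *src = (const uint32_t *)(&pkt.h + 1); /* first record */

        for (unsigned i = 0; i < pkt.h.numsrc; i++)
                printf("src[%u] = 0x%08x\n", i, src[i]);
        return (0);
}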
*/ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, + ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, + ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. 
*/ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", - inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + inet_ntoa_r(igmp->igmp_group, addrbuf), ifp, + ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. 
*/ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. 
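The source-count guard in the IGMPv3 query branch above is written to avoid integer trouble: rather than adding nsrc * 4 to the header lengths and risking a wrap, it compares the source bytes against the space remaining under UINT16_MAX, and the packet is dropped as too short when the check fails. Schematically (IGMP_V3_QUERY_MINLEN is assumed to be the 12-byte fixed query header):

#include <stdbool.h>
#include <stdint.h>

#define IGMP_V3_QUERY_MINLEN    12      /* fixed IGMPv3 query header (assumed) */

static bool
v3_source_count_fits(uint16_t nsrc, int iphlen)
{
        /* sizeof(in_addr_t) == 4; size_t arithmetic cannot wrap here. */
        return ((size_t)nsrc * 4 <=
            (size_t)(UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN));
}

int
main(void)
{
        return (!v3_source_count_fits(100, 20));        /* fits -> exit 0 */
}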
*/ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. 
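Both the v1/v2 and v3 paths here share one countdown idiom: a timer of zero means idle, a decrement that reaches zero fires on this tick, and anything still positive re-arms the global *_timers_running flag so the next fasttimo keeps scanning. A toy standalone illustration; the file-scope flag is a stand-in for V_current_state_timers_running.

#include <stdio.h>

static int current_state_timers_running;   /* stand-in for the VNET flag */

/* Decrement one per-group timer; returns 1 when it fires this tick. */
static int
tick(int *timer)
{
    if (*timer == 0)
        return (0);     /* idle: nothing pending */
    if (--*timer == 0)
        return (1);     /* expired on this tick: act now */
    current_state_timers_running = 1;   /* keep fasttimo armed */
    return (0);
}

int
main(void)
{
    int t = 3;

    for (int i = 1; i <= 4; i++) {
        current_state_timers_running = 0;
        printf("tick %d: fired=%d still_running=%d\n",
            i, tick(&t), current_state_timers_running);
    }
    return (0);
}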
This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, - inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inet_ntoa_r(inm->inm_addr, addrbuf), + inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER && inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1.
*/ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. 
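The "Older Version Querier Present" interval computed above is Robustness * Query-Interval + Query-Response-Interval, converted to slow-timeout ticks. A worked example with what I believe are the FreeBSD defaults (RV=2, QI=125 s, QRI=10 s, PR_SLOWHZ=2); treat the constants as assumptions.

#include <stdio.h>

#define PR_SLOWHZ   2   /* slow timeout ticks per second (assumed) */

int
main(void)
{
    /* FreeBSD defaults (assumed): RV=2, QI=125s, QRI=10s. */
    int rv = 2, qi = 125, qri = 10;

    /* "Older Version Querier Present" interval, RFC 3376 section 8.12:
     * Robustness * Query-Interval + Query-Response-Interval. */
    int ovqp = rv * qi + qri;

    printf("OVQP = %d s = %d slowtimo ticks\n", ovqp, ovqp * PR_SLOWHZ);
    /* Prints: OVQP = 260 s = 520 slowtimo ticks */
    return (0);
}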
*/ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. 
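The function below zeroes igmp_cksum and then stores in_cksum() over the 8-byte message. That checksum is the standard RFC 1071 ones'-complement sum, sketched here in userspace with a local cksum() stand-in; re-summing the completed packet yields zero, which is how receivers validate it.

#include <stdint.h>
#include <stdio.h>

/* RFC 1071 ones'-complement sum; userspace stand-in for in_cksum(). */
static uint16_t
cksum(const uint8_t *p, size_t len)
{
    uint32_t sum = 0;

    for (; len > 1; p += 2, len -= 2)
        sum += (uint32_t)(p[0] << 8 | p[1]);
    if (len == 1)
        sum += (uint32_t)(p[0] << 8);
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return ((uint16_t)~sum);
}

int
main(void)
{
    /* IGMPv2 membership report (type 0x16) for group 224.0.0.22. */
    uint8_t pkt[8] = { 0x16, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x16 };
    uint16_t c = cksum(pkt, sizeof(pkt));

    pkt[2] = c >> 8;    /* store igmp_cksum in network order */
    pkt[3] = c & 0xff;
    printf("cksum 0x%04x, verify 0x%04x\n", (unsigned)c,
        (unsigned)cksum(pkt, sizeof(pkt))); /* verify prints 0x0000 */
    return (0);
}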
*/ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to the group * state; it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group.
*/ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; - +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif + CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", - __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + __func__, inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. 
The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, - inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inet_ntoa_r(inm->inm_addr, addrbuf), + inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", - __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + __func__, inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, - inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inet_ntoa_r(inm->inm_addr, addrbuf), + inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", - __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + __func__, inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. 
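Note the sign convention used by the enqueue helpers here and below: a non-negative return is the number of bytes appended, a negative return carries an errno, and callers such as igmp_initial_join() convert with error = retval * -1 (or return (-retval)). A tiny sketch of that convention; enqueue_record() is hypothetical.

#include <errno.h>
#include <stdio.h>

/* Hypothetical helper: bytes appended on success, -errno on failure,
 * matching the igmp_v3_enqueue_*() convention. */
static int
enqueue_record(int room, int need)
{
    if (need > room)
        return (-ENOMEM);   /* negative errno on failure */
    return (need);          /* bytes appended on success */
}

int
main(void)
{
    int retval, error;

    retval = enqueue_record(8, 16);
    error = (retval <= 0) ? -retval : 0;    /* caller-side conversion */
    printf("retval=%d error=%d (ENOMEM=%d)\n", retval, error, ENOMEM);
    return (0);
}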
*/ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, - inet_ntoa(inm->inm_addr), + inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, - inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inet_ntoa_r(inm->inm_addr, addrbuf), + inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", - __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + __func__, inet_ntoa_r(inm->inm_addr, addrbuf), + inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
*/ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", - __func__, inet_ntoa(inm->inm_addr), + __func__, inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, - igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), + igmp_rec_type_to_str(type), inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. 
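The m0srcs arithmetic below decides how many 32-bit sources fit after the 8-byte group record header, reserving IGMP_LEADINGSPACE for the IP header, Router Alert option, and report header. A worked example at a 1500-byte MTU; the 20+4+8 split mirrors the netinet headers and should be treated as an assumption.

#include <stdio.h>

/* Sizes mirrored from netinet (assumed): ip=20, RA option=4, report=8. */
#define IGMP_LEADINGSPACE   (20 + 4 + 8)
#define GROUPREC_LEN        8   /* struct igmp_grouprec */
#define SRC_LEN             4   /* one in_addr_t source */

int
main(void)
{
    int mtu = 1500;

    /* Same computation as igmp_v3_enqueue_group_record() makes for a
     * freshly allocated packet. */
    int m0srcs = (mtu - IGMP_LEADINGSPACE - GROUPREC_LEN) / SRC_LEN;

    printf("sources per first record at MTU %d: %d\n", mtu, m0srcs);
    /* 365 sources fit before a continuation packet is needed. */
    return (0);
}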
*/ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, - inet_ntoa_haddr(ims->ims_haddr)); + inet_ntoa_haddr(ims->ims_haddr, addrbuf)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). 
*/ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, - inet_ntoa_haddr(ims->ims_haddr)); + inet_ntoa_haddr(ims->ims_haddr, addrbuf)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
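The rectype_t trick above leans on MCAST_INCLUDE == REC_ALLOW == 1 and MCAST_EXCLUDE == REC_BLOCK == 2: a source whose t1 mode is UNDEFINED is classified as the inverse of the group's filter mode via ~mode & REC_FULL, and the outer loop flips the current record type until both bits of drt are set. A standalone sketch; the MCAST_* values are local stand-ins mirroring the system headers.

#include <stdio.h>

/* Stand-ins mirroring the system headers (assumed values). */
enum { MCAST_INCLUDE = 1, MCAST_EXCLUDE = 2 };
enum { REC_NONE = 0x00, REC_ALLOW = 0x01, REC_BLOCK = 0x02,
    REC_FULL = REC_ALLOW | REC_BLOCK };

int
main(void)
{
    /* An UNDEFINED source reads as the inverse of the group mode. */
    printf("EX group, undef src -> %#x (REC_ALLOW)\n",
        (unsigned)(~MCAST_EXCLUDE & REC_FULL));
    printf("IN group, undef src -> %#x (REC_BLOCK)\n",
        (unsigned)(~MCAST_INCLUDE & REC_FULL));

    /* One pass per record type until both bits of drt are set. */
    int crt = REC_ALLOW, drt = REC_NONE;
    while (drt != REC_FULL) {
        printf("pass with crt=%#x\n", (unsigned)crt);
        drt |= crt;
        crt = ~crt & REC_FULL;
    }
    return (0);
}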
*/ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true.
*/ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", - __func__, inet_ntoa_haddr(ims->ims_haddr)); + __func__, + inet_ntoa_haddr(ims->ims_haddr, addrbuf)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatenation to do this, * as large state changes for single groups may have * allocated clusters.
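The merge test below is two-sided: the tail packet must still have room for the candidate's records (an IGMPv3 report can carry at most 65,535 of them) and for its bytes within the MTU budget. A sketch of the predicate with assumed sizes.

#include <stdio.h>

#define IGMP_V3_REPORT_MAXRECS  65535   /* 16-bit record count (assumed) */

/* Same shape as the igmp_v3_merge_state_changes() test: merge only if
 * both the record count and the byte length still fit the tail packet. */
static int
can_merge(int tail_recs, int add_recs, int tail_len, int add_len, int room)
{
    return (tail_recs + add_recs <= IGMP_V3_REPORT_MAXRECS &&
        tail_len + add_len <= room);
}

int
main(void)
{
    int room = 1500 - 32;   /* MTU minus IP+RA+report header (assumed) */

    printf("merge small: %d\n", can_merge(3, 1, 400, 96, room));
    printf("merge big:   %d\n", can_merge(3, 1, 1400, 96, room));
    return (0);
}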
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For a large number of groups the reply to a general query can take * many packets; we should finish sending them before starting to * queue the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else.
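igmp_save_context()/igmp_restore_context() stash the vnet pointer and the interface index in the mbuf packet header so that, by the time the netisr runs, a detached interface is detected by a failed index lookup rather than by dereferencing a stale ifp. A toy sketch of the revalidate-by-index idea; ifnet_byindex_stub() is hypothetical.

#include <stdint.h>
#include <stdio.h>

struct pkt {
    uint32_t ifindex;   /* stands in for igmp_save_context() state */
};

/* Hypothetical ifnet_byindex(): NULL once the interface is detached. */
static const char *
ifnet_byindex_stub(uint32_t idx)
{
    static const char *table[] = { NULL, "em0", NULL /* detached */ };

    return (idx < 3 ? table[idx] : NULL);
}

int
main(void)
{
    struct pkt pkts[2] = { { 1 }, { 2 } };

    for (int i = 0; i < 2; i++) {
        const char *name = ifnet_byindex_stub(pkts[i].ifindex);

        if (name == NULL)   /* drop, as igmp_intr() does */
            printf("drop: ifindex %u went away\n",
                (unsigned)pkts[i].ifindex);
        else
            printf("send on %s\n", name);
    }
    return (0);
}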
Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. 
*/ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_init(const void *unused __unused) { netisr_register_vnet(&igmp_nh); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. 
*/ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister_vnet(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* SLIST_HEAD(,in_multi) igi_relinmhead */ /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 313820) +++ head/sys/netinet/in.c (revision 313821) @@ -1,1494 +1,1497 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. 
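in_localaddr() and in_localip() above classify an address as local when it falls inside any configured subnet, i.e. (i & ia_subnetmask) == ia_subnet in host byte order. A worked example of that test.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t addr   = 0xc0000205;   /* 192.0.2.5 */
    uint32_t mask   = 0xffffff00;   /* 255.255.255.0, ia_subnetmask */
    uint32_t subnet = 0xc0000200;   /* 192.0.2.0,     ia_subnet */

    /* Same test as in_localaddr(): (i & ia_subnetmask) == ia_subnet. */
    printf("192.0.2.5 local:    %d\n", (addr & mask) == subnet);
    printf("198.51.100.7 local: %d\n",
        ((uint32_t)0xc6336407 & mask) == subnet);
    return (0);
}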
*/ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support those old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family.
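in_socktrim() above shortens sin_len so the routing code stores only the significant bytes of a netmask. A sketch using a layout stand-in for the first eight bytes of sockaddr_in (an assumption about the layout): a /24 mask trims to length 7, i.e. up to and including the third address byte.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct sin_stub {       /* prefix layout of sockaddr_in (assumed) */
    uint8_t  len, family;
    uint16_t port;
    uint8_t  addr[4];   /* netmask bytes, big-endian */
};

/* Same idea as in_socktrim(): record only the significant mask bytes. */
static void
socktrim(struct sin_stub *ap)
{
    ap->len = 0;
    for (int i = 3; i >= 0; i--)
        if (ap->addr[i] != 0) {
            ap->len = offsetof(struct sin_stub, addr) + i + 1;
            break;
        }
}

int
main(void)
{
    struct sin_stub m = { .addr = { 0xff, 0xff, 0xff, 0x00 } };

    socktrim(&m);
    printf("sin_len for /24 mask: %d\n", m.len);    /* prints 7 */
    return (0);
}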
* ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether the address already exists. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes: if a netmask isn't * supplied, guess it based on the address class. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. 
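* Loopback and point-to-point interfaces get a host (RTF_HOST) route; any other interface gets a prefix route covering ia_subnet/ia_subnetmask (see in_addprefix() below).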
*/ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, (cmd == SIOCDIFADDR) ? false : true); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. 
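* (The all-hosts group, 224.0.0.1, was joined in in_aifaddr_ioctl() when the first address was configured on the interface.)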
*/ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); IN_MULTI_LOCK(); if (ii->ii_allhosts) { (void)in_leavegroup_locked(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } IN_MULTI_UNLOCK(); } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we already have a route for the given prefix, and add one if we do not. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by another * interface's address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No one seems to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Remove either all lle entries for the given @ia, or only the lle * corresponding to @ia's address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. 
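* For example, if 192.0.2.1/24 is being removed while 192.0.2.2/24 is still configured on an IFF_UP interface, the 192.0.2.0/24 route and the IFA_ROUTE flag are handed over to the surviving address.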
*/ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing an address from an !IFF_UP interface, or * a prefix which also exists on another interface (along with its route). * No entries should exist here except the target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * Remove all L2 entries on the given prefix. */ in_scrubprefixlle(target, 1, flags); /* * As no one seems to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_RLOCK(ifp); */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. 
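* Each address is simply fed back through in_control(SIOCDIFADDR), as if userland had requested its removal.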
*/ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* IF_ADDR_RUNLOCK(ifp); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; int found; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } IF_ADDR_RUNLOCK(ifp); return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in_multi) purgeinms; struct in_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN_MULTI_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { LIST_REMOVE(inm, inm_link); inm_release_locked(inm); } igmp_ifdetach(ifp); IN_MULTI_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(struct llentry *lle) { LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. 
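* The entry is passed in write-locked; the lock is dropped before the final free.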
*/ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); in_lltable_destroy_lle_unlocked(lle); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. 
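* Any other route through a gateway fails the check, since resolving a link-layer address for an off-link destination makes no sense.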
*/ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; - int len; + const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; - len = ((const struct sockaddr_in *)l3addr)->sin_len; - lim = addr + len; + l3sin = (const struct sockaddr_in *)l3addr; + lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC - log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", - inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); + char addrbuf[INET_ADDRSTRLEN]; + + log(LOG_INFO, "IPv4 address: \"%s\" " + "is not on the network\n", + inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. 
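* (in_lltable_rtcheck() above does the verification; LLE_IFADDR entries are exempt).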
*/ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in_lltable_destroy_lle_unlocked(lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 
0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 313820) +++ head/sys/netinet/in_mcast.c (revision 313821) @@ -1,3009 +1,3041 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. 
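* Implements the RFC 3678 source filter socket options on top of the IGMPv3 group state kept in struct in_multi.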
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). 
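* Those legacy KPIs are thin wrappers around in_joingroup() and in_leavegroup().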
*/ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static void inp_freemoptions_internal(struct ip_moptions *); static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); static STAILQ_HEAD(, ip_moptions) imo_gc_list = STAILQ_HEAD_INITIALIZER(imo_gc_list); static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. 
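* Both pointers were snapshotted when the group was allocated in in_getmulti().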
*/ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET) { inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; IN_MULTI_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. 
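* If it has not been allocated yet, there is trivially no match.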
*/ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. 
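* This is safe because the IN_MULTI lock is held across the lookup and the increment.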
*/ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); ++inm->inm_refcount; *pinm = inm; return (0); } memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || - !in_hosteq(inm->inm_addr, *group)) + !in_hosteq(inm->inm_addr, *group)) { + char addrbuf[INET_ADDRSTRLEN]; + panic("%s: ifma %p is inconsistent with %p (%s)", - __func__, ifma, inm, inet_ntoa(*group)); + __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); + } #endif ++inm->inm_refcount; *pinm = inm; IF_ADDR_WUNLOCK(ifp); return (0); } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; IF_ADDR_WUNLOCK(ifp); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ void inm_release_locked(struct in_multi *inm) { struct ifmultiaddr *ifma; IN_MULTI_LOCK_ASSERT(); CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); if (--inm->inm_refcount > 0) { CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, inm->inm_refcount); return; } CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == inm, ("%s: ifma_protospec != inm", __func__)); ifma->ifma_protospec = NULL; inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma(ifma); } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. 
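* Clearing every ims_stp flag must bring the group's iss_rec counter back to zero.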
*/ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. 
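* The t0 state is left MCAST_UNDEFINED, so imf_rollback() will simply free the node if the transaction is aborted.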
*/ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. 
* * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; #ifdef KTR - struct in_addr ia; + struct in_addr ia; + char addrbuf[INET_ADDRSTRLEN]; #endif find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR ia.s_addr = htonl(haddr); CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__, - inet_ntoa(ia), ims); + inet_ntoa_r(ia, addrbuf), ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; struct in_addr ia; ia.s_addr = htonl(ims->ims_haddr); #endif if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s", - __func__, n, inet_ntoa(ia)); + __func__, n, inet_ntoa_r(ia, addrbuf)); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s", - __func__, n, inet_ntoa(ia)); + __func__, n, inet_ntoa_r(ia, addrbuf)); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s", - __func__, n, inet_ntoa(ia)); + __func__, n, inet_ntoa_r(ia, addrbuf)); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s", - __func__, n, inet_ntoa(ia)); + __func__, n, inet_ntoa_r(ia, addrbuf)); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. 
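* A non-zero count tells us the source tree must be reaped once the merge is done.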
*/ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. 
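* A node is unreferenced once its t0 and t1 include/exclude counts are all zero and ims_stp is clear.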
*/ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif IN_MULTI_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__, - inet_ntoa(*gina), ifp, ifp->if_xname); + inet_ntoa_r(*gina, addrbuf), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_locked(inm); } else { *pinm = inm; } return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. 
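 *
 * A kernel consumer which joined with a NULL imf leaves with the
 * equivalent of the sketch below (this is what in_delmulti() does):
 *
 *	(void)in_leavegroup(inm, NULL);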
*/ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif error = 0; IN_MULTI_LOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__, - inm, inet_ntoa(inm->inm_addr), + inm, inet_ntoa_r(inm->inm_addr, addrbuf), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_locked(inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; +#ifdef INVARIANTS + char addrbuf[INET_ADDRSTRLEN]; +#endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), - ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap))); + ("%s: %s not in 224.0.0.0/24", __func__, + inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. * This KPI is for legacy kernel consumers only. */ void in_delmulti(struct in_multi *inm) { (void)in_leavegroup(inm, NULL); } /*#endif*/ /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. 
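 *
 * From userland, the delta-based API is typically driven as in the
 * following sketch; "s" is assumed to be a socket already joined
 * to the group in exclusive mode, and the addresses are examples:
 *
 *	struct ip_mreq_source mreqs;
 *	mreqs.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
 *	mreqs.imr_sourceaddr.s_addr = inet_addr("192.0.2.1");
 *	mreqs.imr_interface.s_addr = INADDR_ANY;
 *	(void)setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE,
 *	    &mreqs, sizeof(mreqs));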
*/ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", - __func__, inet_ntoa(mreqs.imr_interface), ifp); + __func__, inet_ntoa_r(mreqs.imr_interface, addrbuf), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on a * non-exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, - inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not "); + inet_ntoa_r(ssa->sin.sin_addr, addrbuf), + doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer.
*/ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is performed asynchronously in a separate task. * * SMPng: NOTE: assumes INP write lock is held. */ void inp_freemoptions(struct ip_moptions *imo) { KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); IN_MULTI_LOCK(); STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); IN_MULTI_UNLOCK(); taskqueue_enqueue(taskqueue_thread, &imo_gc_task); } static void inp_freemoptions_internal(struct ip_moptions *imo) { struct in_mfilter *imf; size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? 
&imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); (void)in_leavegroup(imo->imo_membership[idx], imf); if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } static void inp_gcmoptions(void *context, int pending) { struct ip_moptions *imo; IN_MULTI_LOCK(); while (!STAILQ_EMPTY(&imo_gc_list)) { imo = STAILQ_FIRST(&imo_gc_list); STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); IN_MULTI_UNLOCK(); inp_freemoptions_internal(imo); IN_MULTI_LOCK(); } IN_MULTI_UNLOCK(); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). 
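 *
 * Note that IP_MULTICAST_TTL and IP_MULTICAST_LOOP may be read back
 * with either a u_char or an int argument; a sketch, assuming an
 * already-created socket "s":
 *
 *	u_char ttl;
 *	socklen_t len = sizeof(ttl);
 *	(void)getsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, &len);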
*/ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); } } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. * FUTURE: Implement IPv4 source-address selection. 
*/ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", - __func__, inet_ntoa(mreqs.imr_interface), ifp); + __func__, inet_ntoa_r(mreqs.imr_interface, addrbuf), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. 
*/ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. 
*/ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); IN_MULTI_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); goto out_in_multi_locked; } } out_in_multi_locked: IN_MULTI_UNLOCK(); INP_WLOCK_ASSERT(inp); if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. 
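 *
 * The RFC 3678 form keys the membership on an interface index
 * instead; a hedged sketch ("s" and "ifindex" are assumed valid,
 * the group address is an example):
 *
 *	struct group_req gr;
 *	struct sockaddr_in *gsin = (struct sockaddr_in *)&gr.gr_group;
 *	memset(&gr, 0, sizeof(gr));
 *	gr.gr_interface = ifindex;
 *	gsin->sin_family = AF_INET;
 *	gsin->sin_len = sizeof(struct sockaddr_in);
 *	gsin->sin_addr.s_addr = inet_addr("239.1.1.1");
 *	(void)setsockopt(s, IPPROTO_IP, MCAST_LEAVE_GROUP,
 *	    &gr, sizeof(gr));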
*/ if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", - __func__, inet_ntoa(mreqs.imr_interface), ifp); + __func__, inet_ntoa_r(mreqs.imr_interface, addrbuf), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, - inet_ntoa(ssa->sin.sin_addr), "not "); + inet_ntoa_r(ssa->sin.sin_addr, addrbuf), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } out_in_multi_locked: IN_MULTI_UNLOCK(); if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; } imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. 
An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp, - inet_ntoa(addr)); + inet_ntoa_r(addr, addrbuf)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. 
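 *
 * A hedged userland sketch of driving this full-state API ("s",
 * "ifindex", "group_sin", "nsrcs" and the "srcs" array are
 * assumptions of the caller's environment):
 *
 *	struct __msfilterreq msfr;
 *	memset(&msfr, 0, sizeof(msfr));
 *	msfr.msfr_ifindex = ifindex;
 *	msfr.msfr_fmode = MCAST_INCLUDE;
 *	msfr.msfr_nsrcs = nsrcs;
 *	msfr.msfr_srcs = srcs;		/* struct sockaddr_storage [] */
 *	memcpy(&msfr.msfr_group, &group_sin, sizeof(group_sin));
 *	(void)setsockopt(s, IPPROTO_IP, IP_MSFILTER,
 *	    &msfr, sizeof(msfr));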
*/ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. 
*/ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. 
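 *
 * The record written back for a matching (ifindex, group) is laid
 * out as:
 *
 *	uint32_t	fmode;		/* MCAST_INCLUDE / MCAST_EXCLUDE */
 *	struct in_addr	src[];		/* 0..n in-mode sources */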
*/ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group %s is not multicast", - __func__, inet_ntoa(group)); + __func__, inet_ntoa_r(group, addrbuf)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); IN_MULTI_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { #ifdef KTR struct in_addr ina; ina.s_addr = htonl(ims->ims_haddr); CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, - inet_ntoa(ina)); + inet_ntoa_r(ina, addrbuf)); #endif /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); IN_MULTI_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. 
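 *
 * Addresses here (and throughout this file) are now formatted with
 * inet_ntoa_r(), which fills a caller-supplied buffer of at least
 * INET_ADDRSTRLEN bytes instead of the shared static buffer that
 * inet_ntoa() returns; the recurring pattern is:
 *
 *	char addrbuf[INET_ADDRSTRLEN];
 *	printf("group %s\n", inet_ntoa_r(inm->inm_addr, addrbuf));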
*/ void inm_print(const struct in_multi *inm) { int t; + char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", - inet_ntoa(inm->inm_addr), + inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 313820) +++ head/sys/netinet/ip_icmp.c (revision 313821) @@ -1,1031 +1,1036 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. 
*/ static VNET_DEFINE(int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); static VNET_DEFINE(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets"); static VNET_DEFINE(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets"); VNET_DEFINE(int, drop_redirect) = 0; #define V_drop_redirect VNET(drop_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); static VNET_DEFINE(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "ICMP reply source for non-local packets"); static VNET_DEFINE(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); static VNET_DEFINE(int, icmp_quotelen) = 8; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); static VNET_DEFINE(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, "Reply to multicast ICMP Echo Request and Timestamp packets"); static VNET_DEFINE(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. 
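 *
 * A module increments a counter by passing the field's index; for
 * example, a KMOD_ICMPSTAT_INC()-style wrapper reduces to roughly:
 *
 *	kmod_icmpstat_inc(
 *	    offsetof(struct icmpstat, icps_error) / sizeof(u_long));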
*/ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen, icmpelen, nlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */ if (oiphlen + 8 > n->m_len) goto freeit; /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)) goto freeit; th = (struct tcphdr *)((caddr_t)oip + oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) goto freeit; icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else if (oip->ip_p == IPPROTO_SCTP) { struct sctphdr *sh; struct sctp_chunkhdr *ch; if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr)) goto stdreply; if (oiphlen + sizeof(struct sctphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct sctphdr) && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL) goto freeit; icmpelen = max(sizeof(struct sctphdr), min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); sh = (struct sctphdr *)((caddr_t)oip + oiphlen); if (ntohl(sh->v_tag) == 0 && ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 && (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 || n->m_next != NULL)) { if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL) goto freeit; ch = (struct sctp_chunkhdr *)(sh + 1); if (ch->chunk_type == SCTP_INITIATION) { icmpelen = max(sizeof(struct sctphdr) + 8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } } } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == 
NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, ICMP_MINLEN + icmplen); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); ICMPSTAT_INC(icps_outhist[type]); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options) * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; *mp = NULL; /* * Locate icmp structure in mbuf, and check * that it is not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(ip->ip_src)); + char srcbuf[INET_ADDRSTRLEN]; + char dstbuf[INET_ADDRSTRLEN]; + printf("icmp_input from %s to %s, len %d\n", - buf, inet_ntoa(ip->ip_dst), icmplen); + inet_ntoa_r(ip->ip_src, srcbuf), + inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing.
*/ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; ICMPSTAT_INC(icps_inhist[icp->icmp_type]); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: code = PRC_UNREACH_PROTOCOL; break; case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp)); ip_stripoptions(m); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); icp = (struct icmp *)(ip + 1); /* * The upper layer handler can rely on: * - The outer IP header has no options. * - The outer IP header, the ICMP header, the inner IP header, * and the first n bytes of the inner payload are contiguous. * n is at least 8, but might be larger based on * ICMP_ADVLENPREF. See its definition in ip_icmp.h. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! 
*/ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; if (ia->ia_ifp == NULL) { ifa_free(&ia->ia_ifa); break; } icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } ifa_free(&ia->ia_ifa); reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + char dstbuf[INET_ADDRSTRLEN]; + char gwbuf[INET_ADDRSTRLEN]; printf("redirect dst %s to %s\n", - buf, inet_ntoa(icp->icmp_gwaddr)); + inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf), + inet_ntoa_r(icp->icmp_gwaddr, gwbuf)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); break; /* * No kernel processing for the following; * just fall through to send to raw listener. 
*/ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_SOURCEQUENCH: default: break; } raw: *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct nhop4_extended nh_ext; struct mbuf *opts = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface the packet came in through. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = nh_ext.nh_src; match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. 
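 *
 * Each multi-byte IP option is laid out as { type, length, data },
 * and the copy loop below walks the option list with these offsets
 * (a sketch of the code that follows):
 *
 *	opt = cp[IPOPT_OPTVAL];	// option type at cp[0]
 *	len = cp[IPOPT_OLEN];	// total option length at cp[1]
 *	cp += len; cnt -= len;	// step to the next option
 *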
*/ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == NULL && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(ip->ip_dst)); + char dstbuf[INET_ADDRSTRLEN]; + char srcbuf[INET_ADDRSTRLEN]; + printf("icmp_send dst %s src %s\n", - buf, inet_ntoa(ip->ip_src)); + inet_ntoa_r(ip->ip_dst, dstbuf), + inet_ntoa_r(ip->ip_src, srcbuf)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). 
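 *
 * For example, with net.inet.icmp.icmplim set to 200, a burst of
 * probes against closed ports might eventually be reported as
 * (illustrative output only):
 *
 *	Limiting closed port RST response from 2200 to 200 packets/sec
 *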
This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ struct icmp_rate { const char *descr; struct counter_rate cr; }; static VNET_DEFINE(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" }, { "sctp ootb response" } }; #define V_icmp_rates VNET(icmp_rates) static void icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { V_icmp_rates[i].cr.cr_rate = counter_u64_alloc(M_WAITOK); V_icmp_rates[i].cr.cr_ticks = ticks; } } VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp_bandlimit_init, NULL); static void icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) counter_u64_free(V_icmp_rates[i].cr.cr_rate); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); int badport_bandlim(int which) { int64_t pps; if (V_icmplim == 0 || which == BANDLIM_UNLIMITED) return (0); KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); pps = counter_ratecheck(&V_icmp_rates[which].cr, V_icmplim); if (pps == -1) return (-1); if (pps > 0 && V_icmplim_output) log(LOG_NOTICE, "Limiting %s from %jd to %d packets/sec\n", V_icmp_rates[which].descr, (intmax_t )pps, V_icmplim); return (0); } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 313820) +++ head/sys/netinet/ip_mroute.c (revision 313821) @@ -1,2949 +1,2966 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. 
* Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. 
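 *
 * I.e. when both locks are needed, the pattern is (as add_mfc() and
 * X_ip_mforward() below do):
 *
 *	VIF_LOCK();
 *	MFC_LOCK();
 *	... modify vif and forwarding state ...
 *	MFC_UNLOCK();
 *	VIF_UNLOCK();
 *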
*/ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
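 *
 * One way to picture the bucketing (an illustrative sketch only;
 * the scheduling code that owns the real hash is not part of this
 * diff): a meter expiring at time t would land in
 *
 *	V_bw_meter_timers[t & (BW_METER_BUCKETS - 1)]
 *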
*/ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; static const struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. 
*/ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
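 *
 * Illustrative userland usage (a sketch, not part of this change):
 * a routing daemon enables forwarding on its raw IGMP socket with
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int v = 1;	// ip_mrouter_init() accepts only version 1
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
 *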
*/ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl_fib(). 
* Typically, only root can create the raw socket in order to execute * this ioctl method; however, the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. */ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the rest of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. 
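 *
 * (This drops the reference taken with if_allmulti(ifp, 1) when
 * the vif was added in add_vif().)
 *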
*/ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. 
*/ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; ifa_free(ifa); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, - (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), + (int)vifcp->vifc_vifi, inet_ntoa_r(vifcp->vifc_lcl_addr, addrbuf), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. 
*/ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", - __func__, inet_ntoa(mfccp->mfcc_origin), + __func__, inet_ntoa_r(mfccp->mfcc_origin, addrbuf), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig %s group %lx parent %x qh %p", - __func__, inet_ntoa(mfccp->mfcc_origin), + __func__, inet_ntoa_r(mfccp->mfcc_origin, addrbuf), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. 
*/ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__, - inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); + inet_ntoa_r(origin, addrbuf), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p", - inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); + inet_ntoa_r(ip->ip_src, addrbuf), (u_long)ntohl(ip->ip_dst.s_addr), + ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. 
*/ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for the packet's origin, * make a copy of the packet & send a message to the routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", - inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); + inet_ntoa_r(ip->ip_src, addrbuf), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. 
*/ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copym(mb0, 0, hlen, M_NOWAIT); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. 
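 *
 * E.g. if rt->mfc_parent names vif 1 (say em0) but this packet
 * arrived on em1, it is counted in mrts_wrong_if below and, when
 * PIM assert processing is enabled, an IGMPMSG_WRONGVIF upcall may
 * be queued to the routing daemon.
 *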
*/ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. 
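 *
 * The idiom below (X_ip_mforward() uses it as well): m_copypacket()
 * may leave the data pages shared read-only, so if the copy is not
 * M_WRITABLE() or the header is split across mbufs, m_pullup() is
 * used to obtain a private, contiguous IP header first.
 *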
*/ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. */ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find out if we already have the same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = 
now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. 
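For illustration, the upcall decision for the two meter types condenses to the following (a restatement of the checks performed in this function, not a replacement for them):

static int
bw_meter_upcall_due(const struct bw_meter *x)	/* illustrative only */
{
	if (x->bm_flags & BW_METER_GEQ)		/* ">=": fire inside the window */
		return ((x->bm_flags & BW_METER_UNIT_PACKETS &&
		    x->bm_measured.b_packets >= x->bm_threshold.b_packets) ||
		    (x->bm_flags & BW_METER_UNIT_BYTES &&
		    x->bm_measured.b_bytes >= x->bm_threshold.b_bytes));
	/* "<=": can only be judged once the measuring interval expires */
	return ((x->bm_flags & BW_METER_UNIT_PACKETS &&
	    x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
	    (x->bm_flags & BW_METER_UNIT_BYTES &&
	    x->bm_measured.b_bytes <= x->bm_threshold.b_bytes));
}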
*/ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? 
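On the other end, the daemon sees one struct igmpmsg header followed by an array of struct bw_upcall on its raw IGMP socket. A hedged userland sketch, mrouted-style (mrouter_sock and handle_bw_upcall() are hypothetical):

char buf[8192];
ssize_t n = recv(mrouter_sock, buf, sizeof(buf), 0);
struct igmpmsg *im = (struct igmpmsg *)buf;

if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0 &&	/* upcall, not real IGMP */
    im->im_msgtype == IGMPMSG_BW_UPCALL) {
	struct bw_upcall *bu = (struct bw_upcall *)(im + 1);
	int i, cnt = (n - sizeof(*im)) / sizeof(*bu);

	for (i = 0; i < cnt; i++)
		handle_bw_upcall(&bu[i]);
}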
*/ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. 
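In other words, an entry's bucket is its absolute expiry second, rounded up, modulo BW_METER_BUCKETS; an equivalent standalone form of the BW_METER_TIMEHASH macro above:

static int
bw_meter_bucket(const struct bw_meter *x)	/* mirrors BW_METER_TIMEHASH */
{
	struct timeval expiry = x->bm_start_time;

	timevaladd(&expiry, &x->bm_threshold.b_time);
	/* round a partial second up so the entry never times out early */
	return ((expiry.tv_sec + (expiry.tv_usec != 0)) % BW_METER_BUCKETS);
}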
*/ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. 
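For reference, the Register message assembled by pim_register_send_rp() below is laid out as: outer IP header (proto PIM), 4-byte PIM header, 4-byte Register flag word, then the original data packet. Hence the inner-packet budget used in this function; a sketch, equivalent to the mtu computation below since pim_encap_iphdr is a struct ip:

/* 0xffff = maximum IP datagram size (16-bit ip_len) */
int register_mtu = 0xffff - sizeof(struct ip) -
    sizeof(struct pim_encap_pimhdr);	/* room left for the inner packet */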
*/ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. 
*/ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ int pim_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = *offp; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; - +#ifdef KTR + char addrbuf[INET_ADDRSTRLEN]; +#endif + *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from %s", - __func__, datalen, inet_ntoa(ip->ip_src)); + __func__, datalen, inet_ntoa_r(ip->ip_src, addrbuf)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reasons perform the checksum over part of the * message, and if error, then over the whole message.
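Condensed, the acceptance rule implemented just below is (illustrative restatement):

int cksum_ok = (PIM_VT_T(pim->pim_vt) == PIM_REGISTER &&
    in_cksum(m, PIM_MINLEN) == 0) ||	/* Register: header part only */
    in_cksum(m, datalen) == 0;		/* fallback: whole message */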
*/ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", - __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); + __func__, inet_ntoa_r(encap_ip->ip_src, addrbuf), + ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, - inet_ntoa(encap_ip->ip_dst)); + inet_ntoa_r(encap_ip->ip_dst, addrbuf)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copym() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. 
mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjusted to * reflect the fact that the inner multicast data is truncated. */ *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { FREE(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, pim_encapcheck, &in_pim_protosw, NULL); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p =
X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/ip_options.c =================================================================== --- head/sys/netinet/ip_options.c (revision 313820) +++ head/sys/netinet/ip_options.c (revision 313821) @@ -1,757 +1,760 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static VNET_DEFINE(int, ip_dosourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, "Enable forwarding source routed IP packets"); #define V_ip_dosourceroute VNET(ip_dosourceroute) static VNET_DEFINE(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); #define V_ip_acceptsourceroute VNET(ip_acceptsourceroute) VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) return 0; else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; } dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. 
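The option parsed here has the classic {type, length, pointer, addr1..addrN} layout. A userland sketch of building an LSRR option for setsockopt(IP_OPTIONS), with hypothetical gateways gw1 and gw2 (ip_pcbopts() later in this file shows how the first hop is then peeled off):

u_char opts[3 + 2 * sizeof(struct in_addr) + 1];	/* option + EOL pad */

opts[IPOPT_OPTVAL] = IPOPT_LSRR;
opts[IPOPT_OLEN] = sizeof(opts) - 1;	/* 11: type + len + ptr + 2 addrs */
opts[IPOPT_OFFSET] = IPOPT_MINOFF;	/* 4: points at the first address */
memcpy(&opts[IPOPT_OFFSET + 1], &gw1, sizeof(gw1));
memcpy(&opts[IPOPT_OFFSET + 1 + sizeof(gw1)], &gw2, sizeof(gw2));
opts[sizeof(opts) - 1] = IPOPT_EOL;	/* pad to a 4-byte multiple */
setsockopt(s, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts));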
*/ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (V_ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!V_ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!V_ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (V_ipstealth) goto dropit; #endif if (!V_ip_dosourceroute) { if (V_ipforwarding) { - char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + char srcbuf[INET_ADDRSTRLEN]; + char dstbuf[INET_ADDRSTRLEN]; + /* * Acting as a router, so generate * ICMP */ nosourcerouting: - strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_WARNING, - "attempted source route from %s to %s\n", - inet_ntoa(ip->ip_src), buf); + "attempted source route from %s " + "to %s\n", + inet_ntoa_r(ip->ip_src, srcbuf), + inet_ntoa_r(ip->ip_dst, dstbuf)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, RT_ALL_FIBS); if (ia == NULL) ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, RT_ALL_FIBS); if (ia == NULL) goto bad; memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); } else { /* XXX MRT 0 for routing */ if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) != 0) goto bad; memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } ip->ip_dst = ipaddr.sin_addr; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). 
*/ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); } else if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } else { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); cp[IPOPT_OFFSET] += sizeof(uint32_t); } } if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. 
*/ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. */ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. */ void ip_stripoptions(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); int olen; olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), (size_t )(m->m_len - sizeof(struct ip))); } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. 
Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = NULL; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * Check for the presence of the IP Router Alert option [RFC2113] * in the header of an IPv4 datagram. * * This call is not intended for use from the forwarding path; it is here * so that protocol domains may check for the presence of the option. * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert * option does not have much relevance to the implementation, though this * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. * Length checks on individual options should already have been performed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. 
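For reference, the Router Alert option this function looks for is exactly four bytes with a zero 16-bit value [RFC2113]; building one (illustrative):

u_char ra[4];

ra[IPOPT_OPTVAL] = IPOPT_RA;	/* 148: copied flag, option number 20 */
ra[IPOPT_OLEN] = sizeof(ra);	/* 4 */
ra[IPOPT_OFFSET] = 0;		/* 16-bit value 0: */
ra[IPOPT_OFFSET + 1] = 0;	/* "routers shall examine packet" */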
*/ int ip_checkrouteralert(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); u_char *cp; int opt, optlen, cnt, found_ra; found_ra = 0; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { #ifdef INVARIANTS if (cnt < IPOPT_OLEN + sizeof(*cp)) break; #endif optlen = cp[IPOPT_OLEN]; #ifdef INVARIANTS if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) break; #endif } switch (opt) { case IPOPT_RA: #ifdef INVARIANTS if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) break; else #endif found_ra = 1; break; default: break; } } return (found_ra); } Index: head/sys/netinet/libalias/alias_local.h =================================================================== --- head/sys/netinet/libalias/alias_local.h (revision 313820) +++ head/sys/netinet/libalias/alias_local.h (revision 313821) @@ -1,404 +1,410 @@ /*- * Copyright (c) 2001 Charles Mott * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Alias_local.h contains the function prototypes for alias.c, * alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well * as any future add-ons). It also includes macros, globals and * struct definitions shared by more than one alias*.c file. * * This include file is intended to be used only within the aliasing * software. Outside world interfaces are defined in alias.h * * This software is placed into the public domain with no restrictions * on its distribution. * * Initial version: August, 1996 (cjm) * * */ #ifndef _ALIAS_LOCAL_H_ #define _ALIAS_LOCAL_H_ #include #include #ifdef _KERNEL #include #include #include #include /* XXX: LibAliasSetTarget() uses this constant. 
*/ #define INADDR_NONE 0xffffffff #include #else #include "alias_sctp.h" #endif /* Sizes of input and output link tables */ #define LINK_TABLE_OUT_SIZE 4001 #define LINK_TABLE_IN_SIZE 4001 #define GET_ALIAS_PORT -1 #define GET_ALIAS_ID GET_ALIAS_PORT +#ifdef _KERNEL +#define INET_NTOA_BUF(buf) (buf) +#else +#define INET_NTOA_BUF(buf) (buf), sizeof(buf) +#endif + struct proxy_entry; struct libalias { LIST_ENTRY(libalias) instancelist; int packetAliasMode; /* Mode flags */ /* - documented in alias.h */ struct in_addr aliasAddress; /* Address written onto source */ /* field of IP packet. */ struct in_addr targetAddress; /* IP address incoming packets */ /* are sent to if no aliasing */ /* link already exists */ struct in_addr nullAddress; /* Used as a dummy parameter for */ /* some function calls */ LIST_HEAD (, alias_link) linkTableOut[LINK_TABLE_OUT_SIZE]; /* Lookup table of pointers to */ /* chains of link records. Each */ LIST_HEAD (, alias_link) linkTableIn[LINK_TABLE_IN_SIZE]; /* link record is doubly indexed */ /* into input and output lookup */ /* tables. */ /* Link statistics */ int icmpLinkCount; int udpLinkCount; int tcpLinkCount; int pptpLinkCount; int protoLinkCount; int fragmentIdLinkCount; int fragmentPtrLinkCount; int sockCount; int cleanupIndex; /* Index to chain of link table */ /* being inspected for old links */ int timeStamp; /* System time in seconds for */ /* current packet */ int lastCleanupTime; /* Last time * IncrementalCleanup() */ /* was called */ int deleteAllLinks; /* If equal to zero, DeleteLink() */ /* will not remove permanent links */ /* log descriptor */ #ifdef _KERNEL char *logDesc; #else FILE *logDesc; #endif /* statistics monitoring */ int newDefaultLink; /* Indicates if a new aliasing */ /* link has been created after a */ /* call to PacketAliasIn/Out(). */ #ifndef NO_FW_PUNCH int fireWallFD; /* File descriptor to be able to */ /* control firewall. Opened by */ /* PacketAliasSetMode on first */ /* setting the PKT_ALIAS_PUNCH_FW */ /* flag. */ int fireWallBaseNum; /* The first firewall entry * free for our use */ int fireWallNumNums; /* How many entries can we * use? */ int fireWallActiveNum; /* Which entry did we last * use? */ char *fireWallField; /* bool array for entries */ #endif unsigned int skinnyPort; /* TCP port used by the Skinny */ /* protocol. */ struct proxy_entry *proxyList; struct in_addr true_addr; /* in network byte order. */ u_short true_port; /* in host byte order. */ /* * sctp code support */ /* counts associations that have progressed to UP and not yet removed */ int sctpLinkCount; #ifdef _KERNEL /* timing queue for keeping track of association timeouts */ struct sctp_nat_timer sctpNatTimer; /* size of hash table used in this instance */ u_int sctpNatTableSize; /* * local look up table sorted by l_vtag/l_port */ LIST_HEAD(sctpNatTableL, sctp_nat_assoc) *sctpTableLocal; /* * global look up table sorted by g_vtag/g_port */ LIST_HEAD(sctpNatTableG, sctp_nat_assoc) *sctpTableGlobal; /* * avoid races in libalias: every public function has to use it. 
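A sketch of the pattern a public entry point is expected to follow, using the locking macros defined just below, and showing the INET_NTOA_BUF() helper added above, which papers over the two-argument kernel and three-argument userland inet_ntoa_r() signatures (LibAliasExampleOp() is hypothetical):

int
LibAliasExampleOp(struct libalias *la, struct in_addr addr)
{
	char abuf[INET_ADDRSTRLEN];
	int ret = 0;

	LIBALIAS_LOCK(la);	/* serialize all public entry points */
	inet_ntoa_r(addr, INET_NTOA_BUF(abuf));	/* abuf now holds the dotted quad */
	/* operate on la, log abuf, set ret accordingly */
	LIBALIAS_UNLOCK(la);
	return (ret);
}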
*/ struct mtx mutex; #endif }; /* Macros */ #ifdef _KERNEL #define LIBALIAS_LOCK_INIT(l) \ mtx_init(&l->mutex, "per-instance libalias mutex", NULL, MTX_DEF) #define LIBALIAS_LOCK_ASSERT(l) mtx_assert(&l->mutex, MA_OWNED) #define LIBALIAS_LOCK(l) mtx_lock(&l->mutex) #define LIBALIAS_UNLOCK(l) mtx_unlock(&l->mutex) #define LIBALIAS_LOCK_DESTROY(l) mtx_destroy(&l->mutex) #else #define LIBALIAS_LOCK_INIT(l) #define LIBALIAS_LOCK_ASSERT(l) #define LIBALIAS_LOCK(l) #define LIBALIAS_UNLOCK(l) #define LIBALIAS_LOCK_DESTROY(l) #endif /* * The following macro is used to update an * internet checksum. "delta" is a 32-bit * accumulation of all the changes to the * checksum (adding in new 16-bit words and * subtracting out old words), and "cksum" * is the checksum value to be updated. */ #define ADJUST_CHECKSUM(acc, cksum) \ do { \ acc += cksum; \ if (acc < 0) { \ acc = -acc; \ acc = (acc >> 16) + (acc & 0xffff); \ acc += acc >> 16; \ cksum = (u_short) ~acc; \ } else { \ acc = (acc >> 16) + (acc & 0xffff); \ acc += acc >> 16; \ cksum = (u_short) acc; \ } \ } while (0) /* Prototypes */ /* * SctpFunction prototypes * */ void AliasSctpInit(struct libalias *la); void AliasSctpTerm(struct libalias *la); int SctpAlias(struct libalias *la, struct ip *ip, int direction); /* * We do not calculate TCP checksums when libalias is a kernel * module, since it has no idea about checksum offloading. * If TCP data has changed, then we just set checksum to zero, * and caller must recalculate it himself. * In case if libalias will edit UDP data, the same approach * should be used. */ #ifndef _KERNEL u_short IpChecksum(struct ip *_pip); u_short TcpChecksum(struct ip *_pip); #endif void DifferentialChecksum(u_short * _cksum, void * _new, void * _old, int _n); /* Internal data access */ struct alias_link * AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, int alias_param, int link_type); struct alias_link * FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _id_alias, int _create); struct alias_link * FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, u_short _id, int _create); struct alias_link * FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _ip_id); struct alias_link * FindFragmentIn2(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _ip_id); struct alias_link * AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); struct alias_link * FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); struct alias_link * FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_char _proto); struct alias_link * FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, u_char _proto); struct alias_link * FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _dst_port, u_short _alias_port, u_char _proto, int _create); struct alias_link * FindUdpTcpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, u_short _src_port, u_short _dst_port, u_char _proto, int _create); struct alias_link * AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, struct in_addr _alias_addr, u_int16_t _src_call_id); struct alias_link * FindPptpOutByCallId(struct libalias *la, struct in_addr _src_addr, struct 
in_addr _dst_addr, u_int16_t _src_call_id); struct alias_link * FindPptpInByCallId(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_int16_t _dst_call_id); struct alias_link * FindPptpOutByPeerCallId(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, u_int16_t _dst_call_id); struct alias_link * FindPptpInByPeerCallId(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_int16_t _alias_call_id); struct alias_link * FindRtspOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, u_short _src_port, u_short _alias_port, u_char _proto); struct in_addr FindOriginalAddress(struct libalias *la, struct in_addr _alias_addr); struct in_addr FindAliasAddress(struct libalias *la, struct in_addr _original_addr); struct in_addr FindSctpRedirectAddress(struct libalias *la, struct sctp_nat_msg *sm); /* External data access/modification */ int FindNewPortGroup(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _src_port, u_short _dst_port, u_short _port_count, u_char _proto, u_char _align); void GetFragmentAddr(struct alias_link *_lnk, struct in_addr *_src_addr); void SetFragmentAddr(struct alias_link *_lnk, struct in_addr _src_addr); void GetFragmentPtr(struct alias_link *_lnk, char **_fptr); void SetFragmentPtr(struct alias_link *_lnk, char *fptr); void SetStateIn(struct alias_link *_lnk, int _state); void SetStateOut(struct alias_link *_lnk, int _state); int GetStateIn (struct alias_link *_lnk); int GetStateOut(struct alias_link *_lnk); struct in_addr GetOriginalAddress(struct alias_link *_lnk); struct in_addr GetDestAddress(struct alias_link *_lnk); struct in_addr GetAliasAddress(struct alias_link *_lnk); struct in_addr GetDefaultAliasAddress(struct libalias *la); void SetDefaultAliasAddress(struct libalias *la, struct in_addr _alias_addr); u_short GetOriginalPort(struct alias_link *_lnk); u_short GetAliasPort(struct alias_link *_lnk); struct in_addr GetProxyAddress(struct alias_link *_lnk); void SetProxyAddress(struct alias_link *_lnk, struct in_addr _addr); u_short GetProxyPort(struct alias_link *_lnk); void SetProxyPort(struct alias_link *_lnk, u_short _port); void SetAckModified(struct alias_link *_lnk); int GetAckModified(struct alias_link *_lnk); int GetDeltaAckIn(u_long, struct alias_link *_lnk); int GetDeltaSeqOut(u_long, struct alias_link *lnk); void AddSeq(struct alias_link *lnk, int delta, u_int ip_hl, u_short ip_len, u_long th_seq, u_int th_off); void SetExpire (struct alias_link *_lnk, int _expire); void ClearCheckNewLink(struct libalias *la); void SetProtocolFlags(struct alias_link *_lnk, int _pflags); int GetProtocolFlags(struct alias_link *_lnk); void SetDestCallId(struct alias_link *_lnk, u_int16_t _cid); #ifndef NO_FW_PUNCH void PunchFWHole(struct alias_link *_lnk); #endif /* Housekeeping function */ void HouseKeeping(struct libalias *); /* Tcp specific routines */ /* lint -save -library Suppress flexelint warnings */ /* Transparent proxy routines */ int ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr, u_short * proxy_server_port, struct in_addr src_addr, struct in_addr dst_addr, u_short dst_port, u_char ip_p); void ProxyModify(struct libalias *la, struct alias_link *_lnk, struct ip *_pip, int _maxpacketsize, int _proxy_type); enum alias_tcp_state { ALIAS_TCP_STATE_NOT_CONNECTED, ALIAS_TCP_STATE_CONNECTED, ALIAS_TCP_STATE_DISCONNECTED }; #if defined(_NETINET_IP_H_) static __inline void * ip_next(struct ip *iphdr) { char *p = (char *)iphdr; 
return (&p[iphdr->ip_hl * 4]); } #endif #if defined(_NETINET_TCP_H_) static __inline void * tcp_next(struct tcphdr *tcphdr) { char *p = (char *)tcphdr; return (&p[tcphdr->th_off * 4]); } #endif #if defined(_NETINET_UDP_H_) static __inline void * udp_next(struct udphdr *udphdr) { return ((void *)(udphdr + 1)); } #endif #endif /* !_ALIAS_LOCAL_H_ */ Index: head/sys/netinet/libalias/alias_nbt.c =================================================================== --- head/sys/netinet/libalias/alias_nbt.c (revision 313820) +++ head/sys/netinet/libalias/alias_nbt.c (revision 313821) @@ -1,853 +1,869 @@ /*- * Written by Atsushi Murai * Copyright (c) 1998, System Planning and Engineering Co. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * TODO: * o Clean up. * o Consider word alignment for other platforms. */ #include __FBSDID("$FreeBSD$"); /* alias_nbt.c performs special processing for NetBIOS over TCP/IP sessions carried over UDP. Initial version: May, 1998 (Atsushi Murai ) See HISTORY file for record of revisions.
*/ /* Includes */ #ifdef _KERNEL #include #include #include #include #else #include #include #include #include #endif #include #include #include #include #ifdef _KERNEL #include #include #else #include "alias_local.h" #include "alias_mod.h" #endif #define NETBIOS_NS_PORT_NUMBER 137 #define NETBIOS_DGM_PORT_NUMBER 138 static int AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *, struct in_addr *, u_short); static int AliasHandleUdpNbtNS(struct libalias *, struct ip *, struct alias_link *, struct in_addr *, u_short *, struct in_addr *, u_short *); static int fingerprint1(struct libalias *la, struct alias_data *ah) { if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->aaddr == NULL || ah->aport == NULL) return (-1); if (ntohs(*ah->dport) == NETBIOS_DGM_PORT_NUMBER || ntohs(*ah->sport) == NETBIOS_DGM_PORT_NUMBER) return (0); return (-1); } static int protohandler1(struct libalias *la, struct ip *pip, struct alias_data *ah) { return (AliasHandleUdpNbt(la, pip, ah->lnk, ah->aaddr, *ah->aport)); } static int fingerprint2(struct libalias *la, struct alias_data *ah) { if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->aaddr == NULL || ah->aport == NULL) return (-1); if (ntohs(*ah->dport) == NETBIOS_NS_PORT_NUMBER || ntohs(*ah->sport) == NETBIOS_NS_PORT_NUMBER) return (0); return (-1); } static int protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah) { AliasHandleUdpNbtNS(la, pip, ah->lnk, ah->aaddr, ah->aport, ah->oaddr, ah->dport); return (0); } static int protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah) { return (AliasHandleUdpNbtNS(la, pip, ah->lnk, &pip->ip_src, ah->sport, ah->aaddr, ah->aport)); } /* Kernel module definition. */ struct proto_handler handlers[] = { { .pri = 130, .dir = IN|OUT, .proto = UDP, .fingerprint = &fingerprint1, .protohandler = &protohandler1 }, { .pri = 140, .dir = IN, .proto = UDP, .fingerprint = &fingerprint2, .protohandler = &protohandler2in }, { .pri = 140, .dir = OUT, .proto = UDP, .fingerprint = &fingerprint2, .protohandler = &protohandler2out }, { EOH } }; static int mod_handler(module_t mod, int type, void *data) { int error; switch (type) { case MOD_LOAD: error = 0; LibAliasAttachHandlers(handlers); break; case MOD_UNLOAD: error = 0; LibAliasDetachHandlers(handlers); break; default: error = EINVAL; } return (error); } #ifdef _KERNEL static #endif moduledata_t alias_mod = { "alias_nbt", mod_handler, NULL }; #ifdef _KERNEL DECLARE_MODULE(alias_nbt, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); MODULE_VERSION(alias_nbt, 1); MODULE_DEPEND(alias_nbt, libalias, 1, 1, 1); #endif typedef struct { struct in_addr oldaddr; u_short oldport; struct in_addr newaddr; u_short newport; u_short *uh_sum; } NBTArguments; typedef struct { unsigned char type; unsigned char flags; u_short id; struct in_addr source_ip; u_short source_port; u_short len; u_short offset; } NbtDataHeader; #define OpQuery 0 #define OpUnknown 4 #define OpRegist 5 #define OpRelease 6 #define OpWACK 7 #define OpRefresh 8 typedef struct { u_short nametrid; u_short dir: 1, opcode:4, nmflags:7, rcode:4; u_short qdcount; u_short ancount; u_short nscount; u_short arcount; } NbtNSHeader; #define FMT_ERR 0x1 #define SRV_ERR 0x2 #define IMP_ERR 0x4 #define RFS_ERR 0x5 #define ACT_ERR 0x6 #define CFT_ERR 0x7 #ifdef LIBALIAS_DEBUG static void PrintRcode(u_char rcode) { switch (rcode) { case FMT_ERR: printf("\nFormat Error."); break; case SRV_ERR: printf("\nServer failure."); break; case IMP_ERR: printf("\nUnsupported request error.\n"); break; case RFS_ERR: printf("\nRefused error.\n"); break; case ACT_ERR: printf("\nActive error.\n"); break; case CFT_ERR: printf("\nName in conflict error.\n"); break; default: printf("\n?%c?=%0x\n", '?', rcode); } } #endif
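/*
 * Background sketch (illustrative only, not part of this change):
 * NetBIOS names use "first level encoding", which expands every name
 * byte into two ASCII characters by adding 'A' (0x41) to each nibble.
 * AliasHandleName() below reverses this for its debug output with
 *
 *	c = (((c1 & 0x0f) << 4) | (c2 & 0x0f)) - 0x11;
 *
 * which is equivalent to ((c1 - 'A') << 4) | (c2 - 'A'); for example,
 * the character pair "EG" decodes to 0x46, ASCII 'F'.
 */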
/* Question Section */ #define QS_TYPE_NB 0x0020 #define QS_TYPE_NBSTAT 0x0021 #define QS_CLAS_IN 0x0001 typedef struct { u_short type; /* The type of Request */ u_short class; /* The class of Request */ } NBTNsQuestion; static u_char * AliasHandleQuestion( u_short count, NBTNsQuestion * q, char *pmax, NBTArguments * nbtarg) { (void)nbtarg; while (count != 0) { /* Name Field */ q = (NBTNsQuestion *) AliasHandleName((u_char *) q, pmax); if (q == NULL || (char *)(q + 1) > pmax) { q = NULL; break; } /* Type and Class fields */ switch (ntohs(q->type)) { case QS_TYPE_NB: case QS_TYPE_NBSTAT: q = q + 1; break; default: #ifdef LIBALIAS_DEBUG printf("\nUnknown Type on Question %0x\n", ntohs(q->type)); #endif break; } count--; } /* Advance past the Question Section */ return ((u_char *) q); } /* Resource Record */ #define RR_TYPE_A 0x0001 #define RR_TYPE_NS 0x0002 #define RR_TYPE_NULL 0x000a #define RR_TYPE_NB 0x0020 #define RR_TYPE_NBSTAT 0x0021 #define RR_CLAS_IN 0x0001 #define SizeOfNsResource 8 typedef struct { u_short type; u_short class; unsigned int ttl; u_short rdlen; } NBTNsResource; #define SizeOfNsRNB 6 typedef struct { u_short g: 1 , ont:2, resv:13; struct in_addr addr; } NBTNsRNB; static u_char * AliasHandleResourceNB( NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { NBTNsRNB *nb; u_short bcount; +#ifdef LIBALIAS_DEBUG + char oldbuf[INET_ADDRSTRLEN]; + char newbuf[INET_ADDRSTRLEN]; +#endif if (q == NULL || (char *)(q + 1) > pmax) return (NULL); /* Check the length */ bcount = ntohs(q->rdlen); /* Forward to Resource NB position */ nb = (NBTNsRNB *) ((u_char *) q + SizeOfNsResource); /* Processing all in_addr array */ #ifdef LIBALIAS_DEBUG - printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr)); - printf("->%s, %dbytes] ", inet_ntoa(nbtarg->newaddr), bcount); + printf("NB rec[%s->%s, %dbytes] ", + inet_ntoa_r(nbtarg->oldaddr, INET_NTOA_BUF(oldbuf)), + inet_ntoa_r(nbtarg->newaddr, INET_NTOA_BUF(newbuf)), + bcount); #endif while (nb != NULL && bcount != 0) { if ((char *)(nb + 1) > pmax) { nb = NULL; break; } #ifdef LIBALIAS_DEBUG - printf("<%s>", inet_ntoa(nb->addr)); + printf("<%s>", inet_ntoa_r(nb->addr, INET_NTOA_BUF(newbuf))); #endif if (!bcmp(&nbtarg->oldaddr, &nb->addr, sizeof(struct in_addr))) { if (*nbtarg->uh_sum != 0) { int acc; u_short *sptr; sptr = (u_short *) & (nb->addr); acc = *sptr++; acc += *sptr; sptr = (u_short *) & (nbtarg->newaddr); acc -= *sptr++; acc -= *sptr; ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); } nb->addr = nbtarg->newaddr; #ifdef LIBALIAS_DEBUG printf("O"); #endif } #ifdef LIBALIAS_DEBUG else { printf("."); } #endif nb = (NBTNsRNB *) ((u_char *) nb + SizeOfNsRNB); bcount -= SizeOfNsRNB; } if (nb == NULL || (char *)(nb + 1) > pmax) { nb = NULL; } return ((u_char *) nb); } #define SizeOfResourceA 6 typedef struct { struct in_addr addr; } NBTNsResourceA; static u_char * AliasHandleResourceA( NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { NBTNsResourceA *a; u_short bcount; +#ifdef LIBALIAS_DEBUG + char oldbuf[INET_ADDRSTRLEN]; + char newbuf[INET_ADDRSTRLEN]; +#endif if (q == NULL || (char *)(q + 1) > pmax) return (NULL); /* Forward to Resource A position */ a = (NBTNsResourceA *) ((u_char *) q + sizeof(NBTNsResource)); /* Check the length */ bcount = ntohs(q->rdlen); /* Processing all in_addr array */ #ifdef LIBALIAS_DEBUG - printf("Arec [%s", inet_ntoa(nbtarg->oldaddr)); - printf("->%s]", inet_ntoa(nbtarg->newaddr)); + printf("Arec [%s->%s]", + inet_ntoa_r(nbtarg->oldaddr, INET_NTOA_BUF(oldbuf)), + inet_ntoa_r(nbtarg->newaddr,
INET_NTOA_BUF(newbuf))); #endif while (bcount != 0) { if (a == NULL || (char *)(a + 1) > pmax) return (NULL); #ifdef LIBALIAS_DEBUG - printf("..%s", inet_ntoa(a->addr)); + printf("..%s", inet_ntoa_r(a->addr, INET_NTOA_BUF(newbuf))); #endif if (!bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr))) { if (*nbtarg->uh_sum != 0) { int acc; u_short *sptr; sptr = (u_short *) & (a->addr); /* Old */ acc = *sptr++; acc += *sptr; sptr = (u_short *) & nbtarg->newaddr; /* New */ acc -= *sptr++; acc -= *sptr; ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); } a->addr = nbtarg->newaddr; } a++; /* XXXX */ bcount -= SizeOfResourceA; } if (a == NULL || (char *)(a + 1) > pmax) a = NULL; return ((u_char *) a); } typedef struct { u_short opcode:4, flags:8, resv:4; } NBTNsResourceNULL; static u_char * AliasHandleResourceNULL( NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { NBTNsResourceNULL *n; u_short bcount; (void)nbtarg; if (q == NULL || (char *)(q + 1) > pmax) return (NULL); /* Forward to Resource NULL position */ n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource)); /* Check the length */ bcount = ntohs(q->rdlen); /* Step through the resource data */ while (bcount != 0) { if ((char *)(n + 1) > pmax) { n = NULL; break; } n++; bcount -= sizeof(NBTNsResourceNULL); } if ((char *)(n + 1) > pmax) n = NULL; return ((u_char *) n); } static u_char * AliasHandleResourceNS( NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { NBTNsResourceNULL *n; u_short bcount; (void)nbtarg; if (q == NULL || (char *)(q + 1) > pmax) return (NULL); /* Forward to Resource NS position */ n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource)); /* Check the length */ bcount = ntohs(q->rdlen); /* Resource Record Name Field */ q = (NBTNsResource *) AliasHandleName((u_char *) n, pmax); /* XXX */ if (q == NULL || (char *)((u_char *) n + bcount) > pmax) return (NULL); else return ((u_char *) n + bcount); } typedef struct { u_short numnames; } NBTNsResourceNBSTAT; static u_char * AliasHandleResourceNBSTAT( NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { NBTNsResourceNBSTAT *n; u_short bcount; (void)nbtarg; if (q == NULL || (char *)(q + 1) > pmax) return (NULL); /* Forward to Resource NBSTAT position */ n = (NBTNsResourceNBSTAT *) ((u_char *) q + sizeof(NBTNsResource)); /* Check the length */ bcount = ntohs(q->rdlen); if (q == NULL || (char *)((u_char *) n + bcount) > pmax) return (NULL); else return ((u_char *) n + bcount); } static u_char * AliasHandleResource( u_short count, NBTNsResource * q, char *pmax, NBTArguments * nbtarg) { while (count != 0) { /* Resource Record Name Field */ q = (NBTNsResource *) AliasHandleName((u_char *) q, pmax); if (q == NULL || (char *)(q + 1) > pmax) break; #ifdef LIBALIAS_DEBUG printf("type=%02x, count=%d\n", ntohs(q->type), count); #endif /* Type and Class fields */ switch (ntohs(q->type)) { case RR_TYPE_NB: q = (NBTNsResource *) AliasHandleResourceNB( q, pmax, nbtarg ); break; case RR_TYPE_A: q = (NBTNsResource *) AliasHandleResourceA( q, pmax, nbtarg ); break; case RR_TYPE_NS: q = (NBTNsResource *) AliasHandleResourceNS( q, pmax, nbtarg ); break; case RR_TYPE_NULL: q = (NBTNsResource *) AliasHandleResourceNULL( q, pmax, nbtarg ); break; case RR_TYPE_NBSTAT: q = (NBTNsResource *) AliasHandleResourceNBSTAT( q, pmax, nbtarg ); break; default: #ifdef LIBALIAS_DEBUG printf( "\nUnknown Type of Resource %0x\n", ntohs(q->type) ); fflush(stdout); #endif break; } count--; } return ((u_char *) q); } static int AliasHandleUdpNbtNS( struct libalias *la, struct ip *pip, /* IP
packet to examine/patch */ struct alias_link *lnk, struct in_addr *alias_address, u_short * alias_port, struct in_addr *original_address, u_short * original_port) { struct udphdr *uh; NbtNSHeader *nsh; u_char *p; char *pmax; NBTArguments nbtarg; (void)la; (void)lnk; /* Set up Common Parameter */ nbtarg.oldaddr = *alias_address; nbtarg.oldport = *alias_port; nbtarg.newaddr = *original_address; nbtarg.newport = *original_port; /* Calculate data length of UDP packet */ uh = (struct udphdr *)ip_next(pip); nbtarg.uh_sum = &(uh->uh_sum); nsh = (NbtNSHeader *)udp_next(uh); p = (u_char *) (nsh + 1); pmax = (char *)uh + ntohs(uh->uh_ulen); if ((char *)(nsh + 1) > pmax) return (-1); #ifdef LIBALIAS_DEBUG printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x" ", an=%04x, ns=%04x, ar=%04x, [%d]-->", nsh->dir ? "Response" : "Request", nsh->nametrid, nsh->opcode, nsh->nmflags, nsh->rcode, ntohs(nsh->qdcount), ntohs(nsh->ancount), ntohs(nsh->nscount), ntohs(nsh->arcount), (u_char *) p - (u_char *) nsh ); #endif /* Question Entries */ if (ntohs(nsh->qdcount) != 0) { p = AliasHandleQuestion( ntohs(nsh->qdcount), (NBTNsQuestion *) p, pmax, &nbtarg ); } /* Answer Resource Records */ if (ntohs(nsh->ancount) != 0) { p = AliasHandleResource( ntohs(nsh->ancount), (NBTNsResource *) p, pmax, &nbtarg ); } /* Authority Resource Records */ if (ntohs(nsh->nscount) != 0) { p = AliasHandleResource( ntohs(nsh->nscount), (NBTNsResource *) p, pmax, &nbtarg ); } /* Additional Resource Records */ if (ntohs(nsh->arcount) != 0) { p = AliasHandleResource( ntohs(nsh->arcount), (NBTNsResource *) p, pmax, &nbtarg ); } #ifdef LIBALIAS_DEBUG PrintRcode(nsh->rcode); #endif return ((p == NULL) ? -1 : 0); } Index: head/sys/netinet/libalias/alias_proxy.c =================================================================== --- head/sys/netinet/libalias/alias_proxy.c (revision 313820) +++ head/sys/netinet/libalias/alias_proxy.c (revision 313821) @@ -1,868 +1,870 @@ /*- * Copyright (c) 2001 Charles Mott * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* file: alias_proxy.c This file encapsulates special operations related to transparent proxy redirection. This is where packets with a particular destination, usually tcp port 80, are redirected to a proxy server.
When packets are proxied, the destination address and port are modified. In certain cases, it is necessary to somehow encode the original address/port info into the packet. Two methods are presently supported: addition of a [DEST addr port] string at the beginning of a tcp stream, or inclusion of an optional field in the IP header. There is one public API function: PacketAliasProxyRule() -- Adds and deletes proxy rules. Rules are stored in a linear linked list, so lookup efficiency won't be too good for large lists. Initial development: April, 1998 (cjm) */ /* System includes */ #ifdef _KERNEL #include #include #include #include #else #include #include #include #include #include #include #endif #include #ifdef _KERNEL #include #include #include #else #include #include "alias.h" /* Public API functions for libalias */ #include "alias_local.h" /* Functions used by alias*.c */ #endif /* Data structures */ /* * A linked list of arbitrary length, based on struct proxy_entry is * used to store proxy rules. */ struct proxy_entry { struct libalias *la; #define PROXY_TYPE_ENCODE_NONE 1 #define PROXY_TYPE_ENCODE_TCPSTREAM 2 #define PROXY_TYPE_ENCODE_IPHDR 3 int rule_index; int proxy_type; u_char proto; u_short proxy_port; u_short server_port; struct in_addr server_addr; struct in_addr src_addr; struct in_addr src_mask; struct in_addr dst_addr; struct in_addr dst_mask; struct proxy_entry *next; struct proxy_entry *last; }; /* File scope variables */ /* Local (static) functions: IpMask() -- Utility function for creating IP masks from integer (1-32) specification. IpAddr() -- Utility function for converting string to IP address IpPort() -- Utility function for converting string to port number RuleAdd() -- Adds an element to the rule list. RuleDelete() -- Removes an element from the rule list. RuleNumberDelete() -- Removes all elements from the rule list having a certain rule number. ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning of a TCP stream. 
ProxyEncodeIpHeader() -- Adds an IP option indicating the true destination of a proxied IP packet */ static int IpMask(int, struct in_addr *); static int IpAddr(char *, struct in_addr *); static int IpPort(char *, int, int *); static void RuleAdd(struct libalias *la, struct proxy_entry *); static void RuleDelete(struct proxy_entry *); static int RuleNumberDelete(struct libalias *la, int); static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int); static void ProxyEncodeIpHeader(struct ip *, int); static int IpMask(int nbits, struct in_addr *mask) { int i; u_int imask; if (nbits < 0 || nbits > 32) return (-1); imask = 0; for (i = 0; i < nbits; i++) imask = (imask >> 1) + 0x80000000; mask->s_addr = htonl(imask); return (0); } static int IpAddr(char *s, struct in_addr *addr) { if (inet_aton(s, addr) == 0) return (-1); else return (0); } static int IpPort(char *s, int proto, int *port) { int n; n = sscanf(s, "%d", port); if (n != 1) #ifndef _KERNEL /* XXX: we accept only numeric ports in kernel */ { struct servent *se; if (proto == IPPROTO_TCP) se = getservbyname(s, "tcp"); else if (proto == IPPROTO_UDP) se = getservbyname(s, "udp"); else return (-1); if (se == NULL) return (-1); *port = (u_int) ntohs(se->s_port); } #else return (-1); #endif return (0); } void RuleAdd(struct libalias *la, struct proxy_entry *entry) { int rule_index; struct proxy_entry *ptr; struct proxy_entry *ptr_last; LIBALIAS_LOCK_ASSERT(la); entry->la = la; if (la->proxyList == NULL) { la->proxyList = entry; entry->last = NULL; entry->next = NULL; return; } rule_index = entry->rule_index; ptr = la->proxyList; ptr_last = NULL; while (ptr != NULL) { if (ptr->rule_index >= rule_index) { if (ptr_last == NULL) { entry->next = la->proxyList; entry->last = NULL; la->proxyList->last = entry; la->proxyList = entry; return; } ptr_last->next = entry; entry->last = ptr_last; entry->next = ptr; ptr->last = entry; return; } ptr_last = ptr; ptr = ptr->next; } ptr_last->next = entry; entry->last = ptr_last; entry->next = NULL; } static void RuleDelete(struct proxy_entry *entry) { struct libalias *la; la = entry->la; LIBALIAS_LOCK_ASSERT(la); if (entry->last != NULL) entry->last->next = entry->next; else la->proxyList = entry->next; if (entry->next != NULL) entry->next->last = entry->last; free(entry); } static int RuleNumberDelete(struct libalias *la, int rule_index) { int err; struct proxy_entry *ptr; LIBALIAS_LOCK_ASSERT(la); err = -1; ptr = la->proxyList; while (ptr != NULL) { struct proxy_entry *ptr_next; ptr_next = ptr->next; if (ptr->rule_index == rule_index) { err = 0; RuleDelete(ptr); } ptr = ptr_next; } return (err); } static void ProxyEncodeTcpStream(struct alias_link *lnk, struct ip *pip, int maxpacketsize) { int slen; char buffer[40]; struct tcphdr *tc; + char addrbuf[INET_ADDRSTRLEN]; /* Compute pointer to tcp header */ tc = (struct tcphdr *)ip_next(pip); /* Don't modify if already modified */ if (GetAckModified(lnk)) return; /* Translate destination address and port to string form */ snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]", - inet_ntoa(GetProxyAddress(lnk)), (u_int) ntohs(GetProxyPort(lnk))); + inet_ntoa_r(GetProxyAddress(lnk), INET_NTOA_BUF(addrbuf)), + (u_int) ntohs(GetProxyPort(lnk))); /* Pad string out to a multiple of two in length */ slen = strlen(buffer); switch (slen % 2) { case 0: strcat(buffer, " \n"); slen += 2; break; case 1: strcat(buffer, "\n"); slen += 1; } /* Check for packet overflow */ if ((int)(ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize) return; /* Shift existing
TCP data and insert destination string */ { int dlen; int hlen; char *p; hlen = (pip->ip_hl + tc->th_off) << 2; dlen = ntohs(pip->ip_len) - hlen; /* Modify first packet that has data in it */ if (dlen == 0) return; p = (char *)pip; p += hlen; bcopy(p, p + slen, dlen); memcpy(p, buffer, slen); } /* Save information about modified sequence number */ { int delta; SetAckModified(lnk); tc = (struct tcphdr *)ip_next(pip); delta = GetDeltaSeqOut(tc->th_seq, lnk); AddSeq(lnk, delta + slen, pip->ip_hl, pip->ip_len, tc->th_seq, tc->th_off); } /* Update IP header packet length and checksum */ { int accumulate; accumulate = pip->ip_len; pip->ip_len = htons(ntohs(pip->ip_len) + slen); accumulate -= pip->ip_len; ADJUST_CHECKSUM(accumulate, pip->ip_sum); } /* Update TCP checksum; use TcpChecksum() since so many things have already changed. */ tc->th_sum = 0; #ifdef _KERNEL tc->th_x2 = 1; #else tc->th_sum = TcpChecksum(pip); #endif } static void ProxyEncodeIpHeader(struct ip *pip, int maxpacketsize) { #define OPTION_LEN_BYTES 8 #define OPTION_LEN_INT16 4 #define OPTION_LEN_INT32 2 u_char option[OPTION_LEN_BYTES]; #ifdef LIBALIAS_DEBUG fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip)); fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip)); #endif (void)maxpacketsize; /* Check to see that there is room to add an IP option */ if (pip->ip_hl > (0x0f - OPTION_LEN_INT32)) return; /* Build option and copy into packet */ { u_char *ptr; struct tcphdr *tc; ptr = (u_char *) pip; ptr += 20; memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20); option[0] = 0x64; /* class: 3 (reserved), option 4 */ option[1] = OPTION_LEN_BYTES; memcpy(&option[2], (u_char *) & pip->ip_dst, 4); tc = (struct tcphdr *)ip_next(pip); memcpy(&option[6], (u_char *) & tc->th_sport, 2); memcpy(ptr, option, 8); } /* Update checksum, header length and packet length */ { int i; int accumulate; u_short *sptr; sptr = (u_short *) option; accumulate = 0; for (i = 0; i < OPTION_LEN_INT16; i++) accumulate -= *(sptr++); sptr = (u_short *) pip; accumulate += *sptr; pip->ip_hl += OPTION_LEN_INT32; accumulate -= *sptr; accumulate += pip->ip_len; pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES); accumulate -= pip->ip_len; ADJUST_CHECKSUM(accumulate, pip->ip_sum); } #undef OPTION_LEN_BYTES #undef OPTION_LEN_INT16 #undef OPTION_LEN_INT32 #ifdef LIBALIAS_DEBUG fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip)); fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip)); #endif } /* Functions used by other packet alias source files ProxyCheck() -- Checks whether an outgoing packet should be proxied. ProxyModify() -- Encodes the original destination address/port for a packet which is to be redirected to a proxy server. */
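#if 0
/*
 * Illustrative sketch only (never compiled, not part of this change):
 * with the encode_tcp_stream proxy type, ProxyEncodeTcpStream() above
 * prefixes the first data segment the proxy server receives with a
 * marker such as
 *
 *	[DEST 203.0.113.5 80]\n
 *
 * padded to an even length.  A hypothetical receiver-side helper could
 * recover the original destination like this; 203.0.113.5 is a
 * documentation address used purely for illustration.
 */
static int
example_parse_dest(const char *line, struct in_addr *addr, u_short *port)
{
	char abuf[16];
	u_int p;

	if (sscanf(line, "[DEST %15s %u]", abuf, &p) != 2 ||
	    inet_aton(abuf, addr) == 0)
		return (-1);
	*port = (u_short)p;
	return (0);
}
#endif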
int ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr, u_short * proxy_server_port, struct in_addr src_addr, struct in_addr dst_addr, u_short dst_port, u_char ip_p) { struct proxy_entry *ptr; LIBALIAS_LOCK_ASSERT(la); ptr = la->proxyList; while (ptr != NULL) { u_short proxy_port; proxy_port = ptr->proxy_port; if ((dst_port == proxy_port || proxy_port == 0) && ip_p == ptr->proto && src_addr.s_addr != ptr->server_addr.s_addr) { struct in_addr src_addr_masked; struct in_addr dst_addr_masked; src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr; dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr; if ((src_addr_masked.s_addr == ptr->src_addr.s_addr) && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) { if ((*proxy_server_port = ptr->server_port) == 0) *proxy_server_port = dst_port; *proxy_server_addr = ptr->server_addr; return (ptr->proxy_type); } } ptr = ptr->next; } return (0); } void ProxyModify(struct libalias *la, struct alias_link *lnk, struct ip *pip, int maxpacketsize, int proxy_type) { LIBALIAS_LOCK_ASSERT(la); (void)la; switch (proxy_type) { case PROXY_TYPE_ENCODE_IPHDR: ProxyEncodeIpHeader(pip, maxpacketsize); break; case PROXY_TYPE_ENCODE_TCPSTREAM: ProxyEncodeTcpStream(lnk, pip, maxpacketsize); break; } } /* Public API functions */ int LibAliasProxyRule(struct libalias *la, const char *cmd) { /* * This function takes command strings of the form: * * server <addr>[:<port>] * [port <port>] * [rule n] * [proto tcp|udp] * [src <addr>[/n]] * [dst <addr>[/n]] * [type encode_tcp_stream|encode_ip_hdr|no_encode] * * delete <rule number> * * Subfields can be in arbitrary order. Port numbers and addresses * must be in either numeric or symbolic form. An optional rule number * is used to control the order in which rules are searched. If two * rules have the same number, then search order cannot be guaranteed, * and the rules should be disjoint. If no rule number is specified, * then 0 is used, and group 0 rules are always checked before any * others. */
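	/*
	 * Usage sketch (hypothetical addresses, not from this change):
	 *
	 *	LibAliasProxyRule(la,
	 *	    "server 10.0.0.1:3128 port 80 proto tcp src 192.168.0.0/16");
	 *	LibAliasProxyRule(la, "delete 0");
	 *
	 * The call returns 0 on success and -1 on a parse or allocation
	 * failure.
	 */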
int i, n, len, ret; int cmd_len; int token_count; int state; char *token; char buffer[256]; char str_port[sizeof(buffer)]; char str_server_port[sizeof(buffer)]; char *res = buffer; int rule_index; int proto; int proxy_type; int proxy_port; int server_port; struct in_addr server_addr; struct in_addr src_addr, src_mask; struct in_addr dst_addr, dst_mask; struct proxy_entry *proxy_entry; LIBALIAS_LOCK(la); ret = 0; /* Copy command line into a buffer */ cmd += strspn(cmd, " \t"); cmd_len = strlen(cmd); if (cmd_len > (int)(sizeof(buffer) - 1)) { ret = -1; goto getout; } strcpy(buffer, cmd); /* Convert to lower case */ len = strlen(buffer); for (i = 0; i < len; i++) buffer[i] = tolower((unsigned char)buffer[i]); /* Set up default values */ rule_index = 0; proxy_type = PROXY_TYPE_ENCODE_NONE; proto = IPPROTO_TCP; proxy_port = 0; server_addr.s_addr = 0; server_port = 0; src_addr.s_addr = 0; IpMask(0, &src_mask); dst_addr.s_addr = 0; IpMask(0, &dst_mask); str_port[0] = 0; str_server_port[0] = 0; /* Parse command string with state machine */ #define STATE_READ_KEYWORD 0 #define STATE_READ_TYPE 1 #define STATE_READ_PORT 2 #define STATE_READ_SERVER 3 #define STATE_READ_RULE 4 #define STATE_READ_DELETE 5 #define STATE_READ_PROTO 6 #define STATE_READ_SRC 7 #define STATE_READ_DST 8 state = STATE_READ_KEYWORD; token = strsep(&res, " \t"); token_count = 0; while (token != NULL) { token_count++; switch (state) { case STATE_READ_KEYWORD: if (strcmp(token, "type") == 0) state = STATE_READ_TYPE; else if (strcmp(token, "port") == 0) state = STATE_READ_PORT; else if (strcmp(token, "server") == 0) state = STATE_READ_SERVER; else if (strcmp(token, "rule") == 0) state = STATE_READ_RULE; else if (strcmp(token, "delete") == 0) state = STATE_READ_DELETE; else if (strcmp(token, "proto") == 0) state = STATE_READ_PROTO; else if (strcmp(token, "src") == 0) state = STATE_READ_SRC; else if (strcmp(token, "dst") == 0) state = STATE_READ_DST; else { ret = -1; goto getout; } break; case STATE_READ_TYPE: if (strcmp(token, "encode_ip_hdr") == 0) proxy_type = PROXY_TYPE_ENCODE_IPHDR; else if (strcmp(token, "encode_tcp_stream") == 0) proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM; else if (strcmp(token, "no_encode") == 0) proxy_type = PROXY_TYPE_ENCODE_NONE; else { ret = -1; goto getout; } state = STATE_READ_KEYWORD; break; case STATE_READ_PORT: strcpy(str_port, token); state = STATE_READ_KEYWORD; break; case STATE_READ_SERVER: { int err; char *p; char s[sizeof(buffer)]; p = token; while (*p != ':' && *p != 0) p++; if (*p != ':') { err = IpAddr(token, &server_addr); if (err) { ret = -1; goto getout; } } else { *p = ' '; n = sscanf(token, "%s %s", s, str_server_port); if (n != 2) { ret = -1; goto getout; } err = IpAddr(s, &server_addr); if (err) { ret = -1; goto getout; } } } state = STATE_READ_KEYWORD; break; case STATE_READ_RULE: n = sscanf(token, "%d", &rule_index); if (n != 1 || rule_index < 0) { ret = -1; goto getout; } state = STATE_READ_KEYWORD; break; case STATE_READ_DELETE: { int err; int rule_to_delete; if (token_count != 2) { ret = -1; goto getout; } n = sscanf(token, "%d", &rule_to_delete); if (n != 1) { ret = -1; goto getout; } err = RuleNumberDelete(la, rule_to_delete); if (err) ret = -1; ret = 0; goto getout; } case STATE_READ_PROTO: if (strcmp(token, "tcp") == 0) proto = IPPROTO_TCP; else if (strcmp(token, "udp") == 0) proto = IPPROTO_UDP; else { ret = -1; goto getout; } state = STATE_READ_KEYWORD; break; case STATE_READ_SRC: case STATE_READ_DST: { int err; char *p; struct in_addr mask;
struct in_addr addr; p = token; while (*p != '/' && *p != 0) p++; if (*p != '/') { IpMask(32, &mask); err = IpAddr(token, &addr); if (err) { ret = -1; goto getout; } } else { int nbits; char s[sizeof(buffer)]; *p = ' '; n = sscanf(token, "%s %d", s, &nbits); if (n != 2) { ret = -1; goto getout; } err = IpAddr(s, &addr); if (err) { ret = -1; goto getout; } err = IpMask(nbits, &mask); if (err) { ret = -1; goto getout; } } if (state == STATE_READ_SRC) { src_addr = addr; src_mask = mask; } else { dst_addr = addr; dst_mask = mask; } } state = STATE_READ_KEYWORD; break; default: ret = -1; goto getout; break; } do { token = strsep(&res, " \t"); } while (token != NULL && !*token); } #undef STATE_READ_KEYWORD #undef STATE_READ_TYPE #undef STATE_READ_PORT #undef STATE_READ_SERVER #undef STATE_READ_RULE #undef STATE_READ_DELETE #undef STATE_READ_PROTO #undef STATE_READ_SRC #undef STATE_READ_DST /* Convert port strings to numbers. This needs to be done after the string is parsed, because the prototype might not be designated before the ports (which might be symbolic entries in /etc/services) */ if (strlen(str_port) != 0) { int err; err = IpPort(str_port, proto, &proxy_port); if (err) { ret = -1; goto getout; } } else { proxy_port = 0; } if (strlen(str_server_port) != 0) { int err; err = IpPort(str_server_port, proto, &server_port); if (err) { ret = -1; goto getout; } } else { server_port = 0; } /* Check that at least the server address has been defined */ if (server_addr.s_addr == 0) { ret = -1; goto getout; } /* Add to linked list */ proxy_entry = malloc(sizeof(struct proxy_entry)); if (proxy_entry == NULL) { ret = -1; goto getout; } proxy_entry->proxy_type = proxy_type; proxy_entry->rule_index = rule_index; proxy_entry->proto = proto; proxy_entry->proxy_port = htons(proxy_port); proxy_entry->server_port = htons(server_port); proxy_entry->server_addr = server_addr; proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr; proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr; proxy_entry->src_mask = src_mask; proxy_entry->dst_mask = dst_mask; RuleAdd(la, proxy_entry); getout: LIBALIAS_UNLOCK(la); return (ret); } Index: head/sys/netinet/libalias/alias_sctp.c =================================================================== --- head/sys/netinet/libalias/alias_sctp.c (revision 313820) +++ head/sys/netinet/libalias/alias_sctp.c (revision 313821) @@ -1,2698 +1,2704 @@ /*- * Copyright (c) 2008 * Swinburne University of Technology, Melbourne, Australia. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Alias_sctp forms part of the libalias kernel module to handle * Network Address Translation (NAT) for the SCTP protocol. * * This software was developed by David A. Hayes and Jason But * * The design is outlined in CAIA technical report number 080618A * (D. Hayes and J. But, "Alias_sctp Version 0.1: SCTP NAT implementation in IPFW") * * Development is part of the CAIA SONATA project, * proposed by Jason But and Grenville Armitage: * http://caia.swin.edu.au/urp/sonata/ * * * This project has been made possible in part by a grant from * the Cisco University Research Program Fund at Community * Foundation Silicon Valley. * */ /** @mainpage * Alias_sctp is part of the SONATA (http://caia.swin.edu.au/urp/sonata) project * to develop and release a BSD licensed implementation of a Network Address * Translation (NAT) module that supports the Stream Control Transmission * Protocol (SCTP). * * Traditional address and port number look ups are inadequate for SCTP's * operation due to both processing requirements and issues with multi-homing. * Alias_sctp integrates with FreeBSD's ipfw/libalias NAT system. * * Version 0.2 features include: * - Support for global multi-homing * - Support for ASCONF modification from Internet Draft * (draft-stewart-behave-sctpnat-04, R. Stewart and M. Tuexen, "Stream control * transmission protocol (SCTP) network address translation," Jul. 
2008) to * provide support for multi-homed privately addressed hosts * - Support for forwarding of T-flagged packets * - Generation and delivery of AbortM/ErrorM packets upon detection of NAT * collisions * - Per-port forwarding rules * - Dynamically controllable logging and statistics * - Dynamic management of timers * - Dynamic control of hash-table size */ /* $FreeBSD$ */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #else #include "alias_sctp.h" #include #include "alias.h" #include "alias_local.h" #include #include #endif //#ifdef _KERNEL /* ---------------------------------------------------------------------- * FUNCTION PROTOTYPES * ---------------------------------------------------------------------- */ /* Packet Parsing Functions */ static int sctp_PktParser(struct libalias *la, int direction, struct ip *pip, struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc); static int GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm, uint32_t *l_vtag, uint32_t *g_vtag, int direction); static int IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction); static void AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction); static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr); static void RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction); static int IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction); /* State Machine Functions */ static int ProcessSctpMsg(struct libalias *la, int direction, \ struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc); static int ID_process(struct libalias *la, int direction,\ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); static int INi_process(struct libalias *la, int direction,\ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); static int INa_process(struct libalias *la, int direction,\ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); static int UP_process(struct libalias *la, int direction,\ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); static int CL_process(struct libalias *la, int direction,\ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); static void TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm,\ struct sctp_nat_assoc *assoc, int sndrply, int direction); /* Hash Table Functions */ static struct sctp_nat_assoc* FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port); static struct sctp_nat_assoc* FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match); static struct sctp_nat_assoc* FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc); static struct sctp_nat_assoc* FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port); static struct sctp_nat_assoc* FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port); static int AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr); static int AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc); static void RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc); static void freeGlobalAddressList(struct sctp_nat_assoc *assoc); /* Timer Queue Functions */ static void sctp_AddTimeOut(struct libalias 
*la, struct sctp_nat_assoc *assoc); static void sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc); static void sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp); void sctp_CheckTimers(struct libalias *la); /* Logging Functions */ static void logsctperror(char* errormsg, uint32_t vtag, int error, int direction); static void logsctpparse(int direction, struct sctp_nat_msg *sm); static void logsctpassoc(struct sctp_nat_assoc *assoc, char *s); static void logTimerQ(struct libalias *la); static void logSctpGlobal(struct libalias *la); static void logSctpLocal(struct libalias *la); #ifdef _KERNEL static void SctpAliasLog(const char *format, ...); #endif /** @defgroup external External code changes and modifications * * Some changes have been made to files external to alias_sctp.(c|h). These * changes are primarily due to code needing to call static functions within * those files or to perform extra functionality that can only be performed * within these files. */ /** @ingroup external * @brief Log current statistics for the libalias instance * * This function is defined in alias_db.c, since it calls static functions in * this file * * Calls the higher level ShowAliasStats() in alias_db.c which logs all current * statistics about the libalias instance - including SCTP statistics * * @param la Pointer to the libalias instance */ void SctpShowAliasStats(struct libalias *la); #ifdef _KERNEL static MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs"); /* Use kernel allocator. */ #ifdef _SYS_MALLOC_H_ #define sn_malloc(x) malloc(x, M_SCTPNAT, M_NOWAIT|M_ZERO) #define sn_calloc(n,x) sn_malloc(x * n) #define sn_free(x) free(x, M_SCTPNAT) #endif// #ifdef _SYS_MALLOC_H_ #else //#ifdef _KERNEL #define sn_malloc(x) malloc(x) #define sn_calloc(n, x) calloc(n, x) #define sn_free(x) free(x) #endif //#ifdef _KERNEL /** @defgroup packet_parser SCTP Packet Parsing * * Macros to: * - Return pointers to the first and next SCTP chunks within an SCTP Packet * - Define possible return values of the packet parsing process * - SCTP message types for storing in the sctp_nat_msg structure @{ */ #define SN_SCTP_FIRSTCHUNK(sctphead) (struct sctp_chunkhdr *)(((char *)sctphead) + sizeof(struct sctphdr)) /**< Returns a pointer to the first chunk in an SCTP packet given a pointer to the SCTP header */ #define SN_SCTP_NEXTCHUNK(chunkhead) (struct sctp_chunkhdr *)(((char *)chunkhead) + SCTP_SIZE32(ntohs(chunkhead->chunk_length))) /**< Returns a pointer to the next chunk in an SCTP packet given a pointer to the current chunk */ #define SN_SCTP_NEXTPARAM(param) (struct sctp_paramhdr *)(((char *)param) + SCTP_SIZE32(ntohs(param->param_length))) /**< Returns a pointer to the next parameter in an SCTP packet given a pointer to the current parameter */ #define SN_MIN_CHUNK_SIZE 4 /**< Smallest possible SCTP chunk size in bytes */ #define SN_MIN_PARAM_SIZE 4 /**< Smallest possible SCTP param size in bytes */ #define SN_VTAG_PARAM_SIZE 12 /**< Size of SCTP ASCONF vtag param in bytes */ #define SN_ASCONFACK_PARAM_SIZE 8 /**< Size of SCTP ASCONF ACK param in bytes */ /* Packet parsing return codes */ #define SN_PARSE_OK 0 /**< Packet parsed for SCTP messages */ #define SN_PARSE_ERROR_IPSHL 1 /**< Packet parsing error - IP and SCTP common header len */ #define SN_PARSE_ERROR_AS_MALLOC 2 /**< Packet parsing error - assoc malloc */ #define SN_PARSE_ERROR_CHHL 3 /**< Packet parsing error - Chunk header len */ #define SN_PARSE_ERROR_DIR 4 /**< Packet parsing error - Direction */ #define 
SN_PARSE_ERROR_VTAG 5 /**< Packet parsing error - Vtag */ #define SN_PARSE_ERROR_CHUNK 6 /**< Packet parsing error - Chunk */ #define SN_PARSE_ERROR_PORT 7 /**< Packet parsing error - Port=0 */ #define SN_PARSE_ERROR_LOOKUP 8 /**< Packet parsing error - Lookup */ #define SN_PARSE_ERROR_PARTIALLOOKUP 9 /**< Packet parsing error - partial lookup only found */ #define SN_PARSE_ERROR_LOOKUP_ABORT 10 /**< Packet parsing error - Lookup - but abort packet */ /* Alias_sctp performs its processing based on a number of key messages */ #define SN_SCTP_ABORT 0x0000 /**< a packet containing an ABORT chunk */ #define SN_SCTP_INIT 0x0001 /**< a packet containing an INIT chunk */ #define SN_SCTP_INITACK 0x0002 /**< a packet containing an INIT-ACK chunk */ #define SN_SCTP_SHUTCOMP 0x0010 /**< a packet containing a SHUTDOWN-COMPLETE chunk */ #define SN_SCTP_SHUTACK 0x0020 /**< a packet containing a SHUTDOWN-ACK chunk */ #define SN_SCTP_ASCONF 0x0100 /**< a packet containing an ASCONF chunk */ #define SN_SCTP_ASCONFACK 0x0200 /**< a packet containing an ASCONF-ACK chunk */ #define SN_SCTP_OTHER 0xFFFF /**< a packet containing a chunk that is not of interest */ /** @} * @defgroup state_machine SCTP NAT State Machine * * Defines the various states an association can be within the NAT @{ */ #define SN_ID 0x0000 /**< Idle state */ #define SN_INi 0x0010 /**< Initialising, waiting for InitAck state */ #define SN_INa 0x0020 /**< Initialising, waiting for AddIpAck state */ #define SN_UP 0x0100 /**< Association in UP state */ #define SN_CL 0x1000 /**< Closing state */ #define SN_RM 0x2000 /**< Removing state */ /** @} * @defgroup Logging Logging Functionality * * Define various log levels and a macro to call specified log functions only if * the current log level (sysctl_log_level) matches the specified level @{ */ #define SN_LOG_LOW 0 #define SN_LOG_EVENT 1 #define SN_LOG_INFO 2 #define SN_LOG_DETAIL 3 #define SN_LOG_DEBUG 4 #define SN_LOG_DEBUG_MAX 5 #define SN_LOG(level, action) if (sysctl_log_level >= level) { action; } /**< Perform log action ONLY if the current log level meets the specified log level */ /** @} * @defgroup Hash Hash Table Macros and Functions * * Defines minimum/maximum/default values for the hash table size @{ */ #define SN_MIN_HASH_SIZE 101 /**< Minimum hash table size (set to stop users choosing stupid values) */ #define SN_MAX_HASH_SIZE 1000001 /**< Maximum hash table size (NB must be less than max int) */ #define SN_DEFAULT_HASH_SIZE 2003 /**< A reasonable default size for the hash tables */ #define SN_LOCAL_TBL 0x01 /**< assoc in local table */ #define SN_GLOBAL_TBL 0x02 /**< assoc in global table */ #define SN_BOTH_TBL 0x03 /**< assoc in both tables */ #define SN_WAIT_TOLOCAL 0x10 /**< assoc waiting for TOLOCAL asconf ACK*/ #define SN_WAIT_TOGLOBAL 0x20 /**< assoc waiting for TOLOCAL asconf ACK*/ #define SN_NULL_TBL 0x00 /**< assoc in No table */ #define SN_MAX_GLOBAL_ADDRESSES 100 /**< absolute maximum global address count*/ #define SN_ADD_OK 0 /**< Association added to the table */ #define SN_ADD_CLASH 1 /**< Clash when trying to add the assoc. 
info to the table */ #define SN_TABLE_HASH(vtag, port, size) (((u_int) vtag + (u_int) port) % (u_int) size) /**< Calculate the hash table lookup position */ /** @} * @defgroup Timer Timer Queue Macros and Functions * * Timer macros set minimum/maximum timeout values and calculate timer expiry * times for the provided libalias instance @{ */ #define SN_MIN_TIMER 1 #define SN_MAX_TIMER 600 #define SN_TIMER_QUEUE_SIZE (SN_MAX_TIMER+2) #define SN_I_T(la) (la->timeStamp + sysctl_init_timer) /**< INIT State expiration time in seconds */ #define SN_U_T(la) (la->timeStamp + sysctl_up_timer) /**< UP State expiration time in seconds */ #define SN_C_T(la) (la->timeStamp + sysctl_shutdown_timer) /**< CL State expiration time in seconds */ #define SN_X_T(la) (la->timeStamp + sysctl_holddown_timer) /**< Wait after a shutdown complete in seconds */ /** @} * @defgroup sysctl SysCtl Variable and callback function declarations * * Sysctl variables to modify NAT functionality in real-time along with associated functions * to manage modifications to the sysctl variables @{ */ /* Callbacks */ int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS); int sysctl_chg_timer(SYSCTL_HANDLER_ARGS); int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS); int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS); int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS); int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS); int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS); int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS); int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS); /* Sysctl variables */ /** @brief net.inet.ip.alias.sctp.log_level */ static u_int sysctl_log_level = 0; /**< Stores the current level of logging */ /** @brief net.inet.ip.alias.sctp.init_timer */ static u_int sysctl_init_timer = 15; /**< Seconds to hold an association in the table waiting for an INIT-ACK or AddIP-ACK */ /** @brief net.inet.ip.alias.sctp.up_timer */ static u_int sysctl_up_timer = 300; /**< Seconds to hold an association in the table while no packets are transmitted */ /** @brief net.inet.ip.alias.sctp.shutdown_timer */ static u_int sysctl_shutdown_timer = 15; /**< Seconds to hold an association in the table waiting for a SHUTDOWN-COMPLETE */ /** @brief net.inet.ip.alias.sctp.holddown_timer */ static u_int sysctl_holddown_timer = 0; /**< Seconds to hold an association in the table after it has been shutdown (to allow for lost SHUTDOWN-COMPLETEs) */ /** @brief net.inet.ip.alias.sctp.hashtable_size */ static u_int sysctl_hashtable_size = SN_DEFAULT_HASH_SIZE; /**< Sets the hash table size for any NEW NAT instances (existing instances retain their existing Hash Table */ /** @brief net.inet.ip.alias.sctp.error_on_ootb */ static u_int sysctl_error_on_ootb = 1; /**< NAT response to receipt of OOTB packet (0 - No response, 1 - NAT will send ErrorM only to local side, 2 - NAT will send local ErrorM and global ErrorM if there was a partial association match 3 - NAT will send ErrorM to both local and global) */ /** @brief net.inet.ip.alias.sctp.accept_global_ootb_addip */ static u_int sysctl_accept_global_ootb_addip = 0; /**< NAT response to receipt of a global OOTB AddIP (0 - Not accepted, 1 - Accepted) */ /** @brief net.inet.ip.alias.sctp.initialising_chunk_proc_limit */ static u_int sysctl_initialising_chunk_proc_limit = 2; /**< Number of chunks that should be processed if there is no current association found (a high value is a DoS risk) */ /** @brief net.inet.ip.alias.sctp.chunk_proc_limit */ static u_int sysctl_chunk_proc_limit = 5; /**< Number of chunks that should be processed to find the key chunk (a high value is a DoS risk) */ /** @brief net.inet.ip.alias.sctp.param_proc_limit */ static u_int sysctl_param_proc_limit = 25; /**< Number of parameters within a chunk that should be processed to find the key parameters (a high value is a DoS risk) */ /** @brief net.inet.ip.alias.sctp.track_global_addresses */ static u_int sysctl_track_global_addresses = 0; /**< Configures the global address tracking option within the NAT (0 - Global tracking is disabled, > 0 - enables tracking but limits the number of global IP addresses to this value) If set to >=1 the NAT will track that many global IP addresses. This may reduce look up table conflicts, but increases processing */ #define SN_NO_ERROR_ON_OOTB 0 /**< Send no errorM on out of the blue packets */ #define SN_LOCAL_ERROR_ON_OOTB 1 /**< Send only local errorM on out of the blue packets */ #define SN_LOCALandPARTIAL_ERROR_ON_OOTB 2 /**< Send local errorM and global errorM for out of the blue packets only if partial match found */ #define SN_ERROR_ON_OOTB 3 /**< Send errorM on out of the blue packets */ #ifdef SYSCTL_NODE SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_ip_alias); static SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, "SCTP NAT"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_log_level, 0, sysctl_chg_loglevel, "IU", "Level of detail (0 - default, 1 - event, 2 - info, 3 - detail, 4 - debug, 5 - max debug)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, init_timer, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_init_timer, 0, sysctl_chg_timer, "IU", "Timeout value (s) while waiting for (INIT-ACK|AddIP-ACK)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, up_timer, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_up_timer, 0, sysctl_chg_timer, "IU", "Timeout value (s) to keep an association up with no traffic"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, shutdown_timer, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_shutdown_timer, 0, sysctl_chg_timer, "IU", "Timeout value (s) while waiting for SHUTDOWN-COMPLETE"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, holddown_timer, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_holddown_timer, 0, sysctl_chg_timer, "IU", "Hold association in table for this many seconds after receiving a SHUTDOWN-COMPLETE"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, hashtable_size, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_hashtable_size, 0, sysctl_chg_hashtable_size, "IU", "Size of hash tables used for NAT lookups (100 < prime_number > 1000001)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, error_on_ootb, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_error_on_ootb, 0, sysctl_chg_error_on_ootb, "IU", "ErrorM sent on receipt of ootb packet:\n\t0 - none,\n\t1 - to local only,\n\t2 - to local and global if a partial association match,\n\t3 - to local and global (DoS risk)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, accept_global_ootb_addip, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_accept_global_ootb_addip, 0, sysctl_chg_accept_global_ootb_addip, "IU", "NAT response to receipt of global OOTB AddIP:\n\t0 - No response,\n\t1 - NAT will accept OOTB global AddIP messages for processing (Security risk)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, initialising_chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_initialising_chunk_proc_limit, 0, sysctl_chg_initialising_chunk_proc_limit, "IU", "Number of chunks that should be processed if there is no current association found:\n\t > 0 (A high value is a DoS risk)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_chunk_proc_limit, 0, sysctl_chg_chunk_proc_limit, "IU", "Number of chunks that should be processed to find key chunk:\n\t>= initialising_chunk_proc_limit (A high value is a DoS risk)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, param_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_param_proc_limit, 0, sysctl_chg_param_proc_limit, "IU", "Number of parameters (in a chunk) that should be processed to find key parameters:\n\t> 1 (A high value is a DoS risk)"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, track_global_addresses, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_track_global_addresses, 0, sysctl_chg_track_global_addresses, "IU", "Configures the global address tracking option within the NAT:\n\t0 - Global tracking is disabled,\n\t> 0 - enables tracking but limits the number of global IP addresses to this value");
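/*
 * Tuning sketch (illustrative only, not part of this change): the OIDs
 * declared above live under net.inet.ip.alias.sctp and can be changed
 * at run time, e.g.
 *
 *	sysctl net.inet.ip.alias.sctp.log_level=2
 *	sysctl net.inet.ip.alias.sctp.hashtable_size=4001
 *
 * The callbacks below clamp or adjust the supplied values; for example,
 * the hash table size is rounded up until it has no factor of 2, 3, 5,
 * 7 or 11, and a new size only applies to NAT instances created
 * afterwards.
 */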
#endif /* SYSCTL_NODE */ /** @} * @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.log_level * * Updates the variable sysctl_log_level to the provided value and ensures * it is in the valid range (SN_LOG_LOW - SN_LOG_DEBUG_MAX) */ int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS) { u_int level = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &level, 0, req); if (error) return (error); level = (level > SN_LOG_DEBUG_MAX)?(SN_LOG_DEBUG_MAX):(level); sysctl_log_level = level; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.(init_timer|up_timer|shutdown_timer) * * Updates the timer-based sysctl variables. The new values are sanity-checked * to make sure that they are within the range SN_MIN_TIMER-SN_MAX_TIMER. The * holddown timer is allowed to be 0 */ int sysctl_chg_timer(SYSCTL_HANDLER_ARGS) { u_int timer = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &timer, 0, req); if (error) return (error); timer = (timer > SN_MAX_TIMER)?(SN_MAX_TIMER):(timer); if (((u_int *)arg1) != &sysctl_holddown_timer) { timer = (timer < SN_MIN_TIMER)?(SN_MIN_TIMER):(timer); } *(u_int *)arg1 = timer; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.hashtable_size * * Updates the hashtable_size sysctl variable. The new value should be a prime * number. We sanity check to ensure that the size is within the range * SN_MIN_HASH_SIZE-SN_MAX_HASH_SIZE. We then check the provided number to see * if it is prime. We approximate by checking that (2,3,5,7,11) are not factors, * incrementing the user provided value until we find a suitable number. */ int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS) { u_int size = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &size, 0, req); if (error) return (error); size = (size < SN_MIN_HASH_SIZE)?(SN_MIN_HASH_SIZE):((size > SN_MAX_HASH_SIZE)?(SN_MAX_HASH_SIZE):(size)); size |= 0x00000001; /* make odd */ for(;(((size % 3) == 0) || ((size % 5) == 0) || ((size % 7) == 0) || ((size % 11) == 0)); size+=2); sysctl_hashtable_size = size; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.error_on_ootb * * Updates the error_on_ootb sysctl variable. * If set to 0, no ErrorM will be sent if there is a look up table clash * If set to 1, an ErrorM is sent only to the local side * If set to 2, an ErrorM is sent to the local side and global side if there is * a partial association match * If set to 3, an ErrorM is sent to both local and global sides (DoS risk). */ int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS) { u_int flag = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &flag, 0, req); if (error) return (error); sysctl_error_on_ootb = (flag > SN_ERROR_ON_OOTB) ?
SN_ERROR_ON_OOTB: flag; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.accept_global_ootb_addip * * If set to 1 the NAT will accept ootb global addip messages for processing (Security risk) * Default is 0, only responding to local ootb AddIP messages */ int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS) { u_int flag = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &flag, 0, req); if (error) return (error); sysctl_accept_global_ootb_addip = (flag == 1) ? 1: 0; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.initialising_chunk_proc_limit * * Updates the initialising_chunk_proc_limit sysctl variable. Number of chunks * that should be processed if there is no current association found: > 0 (A * high value is a DoS risk) */ int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS) { u_int proclimit = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &proclimit, 0, req); if (error) return (error); sysctl_initialising_chunk_proc_limit = (proclimit < 1) ? 1: proclimit; sysctl_chunk_proc_limit = (sysctl_chunk_proc_limit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : sysctl_chunk_proc_limit; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.chunk_proc_limit * * Updates the chunk_proc_limit sysctl variable. * Number of chunks that should be processed to find key chunk: * >= initialising_chunk_proc_limit (A high value is a DoS risk) */ int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS) { u_int proclimit = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &proclimit, 0, req); if (error) return (error); sysctl_chunk_proc_limit = (proclimit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : proclimit; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.param_proc_limit * * Updates the param_proc_limit sysctl variable. * Number of parameters that should be processed to find key parameters: * > 1 (A high value is a DoS risk) */ int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS) { u_int proclimit = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &proclimit, 0, req); if (error) return (error); sysctl_param_proc_limit = (proclimit < 2) ? 2 : proclimit; return (0); } /** @ingroup sysctl * @brief sysctl callback for changing net.inet.ip.alias.sctp.track_global_addresses * *Configures the global address tracking option within the NAT (0 - Global *tracking is disabled, > 0 - enables tracking but limits the number of global *IP addresses to this value) */ int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS) { u_int num_to_track = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &num_to_track, 0, req); if (error) return (error); sysctl_track_global_addresses = (num_to_track > SN_MAX_GLOBAL_ADDRESSES) ? 
SN_MAX_GLOBAL_ADDRESSES : num_to_track; return (0); } /* ---------------------------------------------------------------------- * CODE BEGINS HERE * ---------------------------------------------------------------------- */ /** * @brief Initialises the SCTP NAT Implementation * * Creates the look-up tables and the timer queue and initialises all state * variables * * @param la Pointer to the relevant libalias instance */ void AliasSctpInit(struct libalias *la) { /* Initialise association tables*/ int i; la->sctpNatTableSize = sysctl_hashtable_size; SN_LOG(SN_LOG_EVENT, SctpAliasLog("Initialising SCTP NAT Instance (hash_table_size:%d)\n", la->sctpNatTableSize)); la->sctpTableLocal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableL)); la->sctpTableGlobal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableG)); la->sctpNatTimer.TimerQ = sn_calloc(SN_TIMER_QUEUE_SIZE, sizeof(struct sctpTimerQ)); /* Initialise hash table */ for (i = 0; i < la->sctpNatTableSize; i++) { LIST_INIT(&la->sctpTableLocal[i]); LIST_INIT(&la->sctpTableGlobal[i]); } /* Initialise circular timer Q*/ for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++) LIST_INIT(&la->sctpNatTimer.TimerQ[i]); #ifdef _KERNEL la->sctpNatTimer.loc_time=time_uptime; /* la->timeStamp is not set yet */ #else la->sctpNatTimer.loc_time=la->timeStamp; #endif la->sctpNatTimer.cur_loc = 0; la->sctpLinkCount = 0; } /** * @brief Cleans up the SCTP NAT Implementation prior to unloading * * Removes all entries from the timer queue, freeing associations as it goes. * We then free memory allocated to the look-up tables and the timer queue * * NOTE: We do not need to traverse the look-up tables as each association * will always have an entry in the timer queue; freeing this memory * once will free all memory allocated to entries in the look-up tables * * @param la Pointer to the relevant libalias instance */ void AliasSctpTerm(struct libalias *la) { struct sctp_nat_assoc *assoc1, *assoc2; int i; LIBALIAS_LOCK_ASSERT(la); SN_LOG(SN_LOG_EVENT, SctpAliasLog("Removing SCTP NAT Instance\n")); for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++) { assoc1 = LIST_FIRST(&la->sctpNatTimer.TimerQ[i]); while (assoc1 != NULL) { freeGlobalAddressList(assoc1); assoc2 = LIST_NEXT(assoc1, timer_Q); sn_free(assoc1); assoc1 = assoc2; } } sn_free(la->sctpTableLocal); sn_free(la->sctpTableGlobal); sn_free(la->sctpNatTimer.TimerQ); } /** * @brief Handles SCTP packets passed from libalias * * This function needs to actually NAT/drop packets and possibly create and * send AbortM or ErrorM packets in response.
The process involves: * - Validating the direction parameter passed by the caller * - Checking and handling any expired timers for the NAT * - Calling sctp_PktParser() to parse the packet * - Calling ProcessSctpMsg() to decide the appropriate outcome and to update * the NAT tables * - Based on the return code either: * - NAT the packet * - Construct and send an ErrorM|AbortM packet * - Mark the association for removal from the tables * - Potentially remove the association from all lookup tables * - Return the appropriate result to libalias * * @param la Pointer to the relevant libalias instance * @param pip Pointer to IP packet to process * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * * @return PKT_ALIAS_OK | PKT_ALIAS_IGNORE | PKT_ALIAS_ERROR */ int SctpAlias(struct libalias *la, struct ip *pip, int direction) { int rtnval; struct sctp_nat_msg msg; struct sctp_nat_assoc *assoc = NULL; if ((direction != SN_TO_LOCAL) && (direction != SN_TO_GLOBAL)) { SctpAliasLog("ERROR: Invalid direction\n"); return(PKT_ALIAS_ERROR); } sctp_CheckTimers(la); /* Check timers */ /* Parse the packet */ rtnval = sctp_PktParser(la, direction, pip, &msg, &assoc); //using *char (change to mbuf when get code from paolo) switch (rtnval) { case SN_PARSE_OK: break; case SN_PARSE_ERROR_CHHL: /* Not an error if there is a chunk length parsing error and this is a fragmented packet */ if (ntohs(pip->ip_off) & IP_MF) { rtnval = SN_PARSE_OK; break; } SN_LOG(SN_LOG_EVENT, logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); return(PKT_ALIAS_ERROR); case SN_PARSE_ERROR_PARTIALLOOKUP: if (sysctl_error_on_ootb > SN_LOCALandPARTIAL_ERROR_ON_OOTB) { SN_LOG(SN_LOG_EVENT, logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); return(PKT_ALIAS_ERROR); } /* FALLTHROUGH */ case SN_PARSE_ERROR_LOOKUP: if (sysctl_error_on_ootb == SN_ERROR_ON_OOTB || (sysctl_error_on_ootb == SN_LOCALandPARTIAL_ERROR_ON_OOTB && direction == SN_TO_LOCAL) || (sysctl_error_on_ootb == SN_LOCAL_ERROR_ON_OOTB && direction == SN_TO_GLOBAL)) { TxAbortErrorM(la, &msg, assoc, SN_REFLECT_ERROR, direction); /*NB assoc=NULL */ return(PKT_ALIAS_RESPOND); } /* FALLTHROUGH */ default: SN_LOG(SN_LOG_EVENT, logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); return(PKT_ALIAS_ERROR); } SN_LOG(SN_LOG_DETAIL, logsctpassoc(assoc, "*"); logsctpparse(direction, &msg); ); /* Process the SCTP message */ rtnval = ProcessSctpMsg(la, direction, &msg, assoc); SN_LOG(SN_LOG_DEBUG_MAX, logsctpassoc(assoc, "-"); logSctpLocal(la); logSctpGlobal(la); ); SN_LOG(SN_LOG_DEBUG, logTimerQ(la)); switch(rtnval){ case SN_NAT_PKT: switch(direction) { case SN_TO_LOCAL: DifferentialChecksum(&(msg.ip_hdr->ip_sum), &(assoc->l_addr), &(msg.ip_hdr->ip_dst), 2); msg.ip_hdr->ip_dst = assoc->l_addr; /* change dst address to local address*/ break; case SN_TO_GLOBAL: DifferentialChecksum(&(msg.ip_hdr->ip_sum), &(assoc->a_addr), &(msg.ip_hdr->ip_src), 2); msg.ip_hdr->ip_src = assoc->a_addr; /* change src to alias addr*/ break; default: rtnval = SN_DROP_PKT; /* shouldn't get here, but if it does drop packet */ SN_LOG(SN_LOG_LOW, logsctperror("ERROR: Invalid direction", msg.sctp_hdr->v_tag, rtnval, direction)); break; } break; case SN_DROP_PKT: SN_LOG(SN_LOG_DETAIL, logsctperror("SN_DROP_PKT", msg.sctp_hdr->v_tag, rtnval, direction)); break; case SN_REPLY_ABORT: case SN_REPLY_ERROR: case SN_SEND_ABORT: TxAbortErrorM(la, &msg, assoc, rtnval, direction); break; default: // big error, remove association and go to idle and write log messages SN_LOG(SN_LOG_LOW, logsctperror("SN_PROCESSING_ERROR",
msg.sctp_hdr->v_tag, rtnval, direction)); assoc->state=SN_RM;/* Mark for removal*/ break; } /* Remove association if tagged for removal */ if (assoc->state == SN_RM) { if (assoc->TableRegister) { sctp_RmTimeOut(la, assoc); RmSctpAssoc(la, assoc); } LIBALIAS_LOCK_ASSERT(la); freeGlobalAddressList(assoc); sn_free(assoc); } switch(rtnval) { case SN_NAT_PKT: return(PKT_ALIAS_OK); case SN_SEND_ABORT: return(PKT_ALIAS_OK); case SN_REPLY_ABORT: case SN_REPLY_ERROR: case SN_REFLECT_ERROR: return(PKT_ALIAS_RESPOND); case SN_DROP_PKT: default: return(PKT_ALIAS_ERROR); } } /** * @brief Send an AbortM or ErrorM * * We construct the new SCTP packet to send in place of the existing packet we * have been asked to NAT. This function can only be called if the original * packet was successfully parsed as a valid SCTP packet. * * An AbortM (without cause) packet is the smallest SCTP packet available and as * such there is always space in the existing packet buffer to fit the AbortM * packet. An ErrorM packet is 4 bytes longer than the AbortM (the error cause is not * optional). An ErrorM is sent in response to an AddIP when the Vtag/address * combination, if added, will produce a conflict in the association look up * tables. It may also be sent for an unexpected packet - one with no * matching association in the NAT table - to request an AddIP so that the * association can be added. The smallest valid SCTP packet while the association is in an * up-state is a Heartbeat packet, which is big enough to be transformed to an * ErrorM. * * We create a temporary character array to store the packet as we are constructing * it. We then populate the array with appropriate values based on: * - Packet type (AbortM | ErrorM) * - Initial packet direction (SN_TO_LOCAL | SN_TO_GLOBAL) * - NAT response (Send packet | Reply packet) * * Once complete, we copy the contents of the temporary packet over the original * SCTP packet we were asked to NAT * * @param la Pointer to the relevant libalias instance * @param sm Pointer to sctp message information * @param assoc Pointer to current association details * @param sndrply SN_SEND_ABORT | SN_REPLY_ABORT | SN_REPLY_ERROR * @param direction SN_TO_LOCAL | SN_TO_GLOBAL */ static uint32_t local_sctp_finalize_crc32(uint32_t crc32c) { /* This routine is duplicated from SCTP * we need to do that since it MAY be that SCTP * is NOT compiled into the kernel. The CRC32C routines * however are always available in libkern. */ uint32_t result; #if BYTE_ORDER == BIG_ENDIAN uint8_t byte0, byte1, byte2, byte3; #endif /* Complement the result */ result = ~crc32c; #if BYTE_ORDER == BIG_ENDIAN /* * For BIG-ENDIAN (aka Motorola) byte order the result is in * little-endian form. So we must manually swap the bytes. Then we * can call htonl() which does nothing... */ byte0 = result & 0x000000ff; byte1 = (result >> 8) & 0x000000ff; byte2 = (result >> 16) & 0x000000ff; byte3 = (result >> 24) & 0x000000ff; crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3); #else /* * For INTEL platforms the result comes out in network order; * neither htonl nor the manual swap above is required, so we * optimize both out.
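 * For example (an illustrative value, not from the original comment): a * running crc32c of 0x789abcde complements to 0x87654321; a big-endian host * reorders those bytes to 0x21436587 to match the little-endian wire order, * while a little-endian host stores the complemented value unchanged.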
*/ crc32c = result; #endif return (crc32c); } static void TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int sndrply, int direction) { int sctp_size = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_error_cause); int ip_size = sizeof(struct ip) + sctp_size; int include_error_cause = 1; char tmp_ip[ip_size]; + char addrbuf[INET_ADDRSTRLEN]; if (ntohs(sm->ip_hdr->ip_len) < ip_size) { /* short packet, cannot send error cause */ include_error_cause = 0; ip_size = ip_size - sizeof(struct sctp_error_cause); sctp_size = sctp_size - sizeof(struct sctp_error_cause); } /* Assign header pointers packet */ struct ip* ip = (struct ip *) tmp_ip; struct sctphdr* sctp_hdr = (struct sctphdr *) ((char *) ip + sizeof(*ip)); struct sctp_chunkhdr* chunk_hdr = (struct sctp_chunkhdr *) ((char *) sctp_hdr + sizeof(*sctp_hdr)); struct sctp_error_cause* error_cause = (struct sctp_error_cause *) ((char *) chunk_hdr + sizeof(*chunk_hdr)); /* construct ip header */ ip->ip_v = sm->ip_hdr->ip_v; ip->ip_hl = 5; /* 5*32 bit words */ ip->ip_tos = 0; ip->ip_len = htons(ip_size); ip->ip_id = sm->ip_hdr->ip_id; ip->ip_off = 0; ip->ip_ttl = 255; ip->ip_p = IPPROTO_SCTP; /* The definitions below should be removed when they make it into the SCTP stack */ #define SCTP_MIDDLEBOX_FLAG 0x02 #define SCTP_NAT_TABLE_COLLISION 0x00b0 #define SCTP_MISSING_NAT 0x00b1 chunk_hdr->chunk_type = (sndrply & SN_TX_ABORT) ? SCTP_ABORT_ASSOCIATION : SCTP_OPERATION_ERROR; chunk_hdr->chunk_flags = SCTP_MIDDLEBOX_FLAG; if (include_error_cause) { error_cause->code = htons((sndrply & SN_REFLECT_ERROR) ? SCTP_MISSING_NAT : SCTP_NAT_TABLE_COLLISION); error_cause->length = htons(sizeof(struct sctp_error_cause)); chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr) + sizeof(struct sctp_error_cause)); } else { chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr)); } /* set specific values */ switch(sndrply) { case SN_REFLECT_ERROR: chunk_hdr->chunk_flags |= SCTP_HAD_NO_TCB; /* set Tbit */ sctp_hdr->v_tag = sm->sctp_hdr->v_tag; break; case SN_REPLY_ERROR: sctp_hdr->v_tag = (direction == SN_TO_LOCAL) ? assoc->g_vtag : assoc->l_vtag ; break; case SN_SEND_ABORT: sctp_hdr->v_tag = sm->sctp_hdr->v_tag; break; case SN_REPLY_ABORT: sctp_hdr->v_tag = sm->sctpchnk.Init->initiate_tag; break; } /* Set send/reply values */ if (sndrply == SN_SEND_ABORT) { /*pass through NAT */ ip->ip_src = (direction == SN_TO_LOCAL) ? sm->ip_hdr->ip_src : assoc->a_addr; ip->ip_dst = (direction == SN_TO_LOCAL) ? assoc->l_addr : sm->ip_hdr->ip_dst; sctp_hdr->src_port = sm->sctp_hdr->src_port; sctp_hdr->dest_port = sm->sctp_hdr->dest_port; } else { /* reply and reflect */ ip->ip_src = sm->ip_hdr->ip_dst; ip->ip_dst = sm->ip_hdr->ip_src; sctp_hdr->src_port = sm->sctp_hdr->dest_port; sctp_hdr->dest_port = sm->sctp_hdr->src_port; } /* Calculate IP header checksum */ ip->ip_sum = in_cksum_hdr(ip); /* calculate SCTP header CRC32 */ sctp_hdr->checksum = 0; sctp_hdr->checksum = local_sctp_finalize_crc32(calculate_crc32c(0xffffffff, (unsigned char *) sctp_hdr, sctp_size)); memcpy(sm->ip_hdr, ip, ip_size); SN_LOG(SN_LOG_EVENT,SctpAliasLog("%s %s 0x%x (->%s:%u vtag=0x%x crc=0x%x)\n", ((sndrply == SN_SEND_ABORT) ? "Sending" : "Replying"), ((sndrply & SN_TX_ERROR) ? "ErrorM" : "AbortM"), (include_error_cause ? 
ntohs(error_cause->code) : 0), - inet_ntoa(ip->ip_dst),ntohs(sctp_hdr->dest_port), + inet_ntoa_r(ip->ip_dst, INET_NTOA_BUF(addrbuf)), + ntohs(sctp_hdr->dest_port), ntohl(sctp_hdr->v_tag), ntohl(sctp_hdr->checksum))); } /* ---------------------------------------------------------------------- * PACKET PARSER CODE * ---------------------------------------------------------------------- */ /** @addtogroup packet_parser * * These functions parse the SCTP packet and fill a sctp_nat_msg structure * with the parsed contents. */ /** @ingroup packet_parser * @brief Parses SCTP packets for the key SCTP chunk that will be processed * * This function parses SCTP packets for the key SCTP chunk that will be * processed. It completes the sctp_nat_msg structure and either retrieves the * relevant (existing) stored association from the Hash Tables or creates a new * association entity with state SN_ID * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param pip Pointer to the IP packet to process * @param sm Pointer to sctp message information * @param passoc Pointer to the association this SCTP Message belongs to * * @return SN_PARSE_OK | SN_PARSE_ERROR_* */ static int sctp_PktParser(struct libalias *la, int direction, struct ip *pip, struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc) //sctp_PktParser(int direction, struct mbuf *ipak, int ip_hdr_len,struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc) { struct sctphdr *sctp_hdr; struct sctp_chunkhdr *chunk_hdr; struct sctp_paramhdr *param_hdr; struct in_addr ipv4addr; int bytes_left; /* bytes left in ip packet */ int chunk_length; int chunk_count; int partial_match = 0; // mbuf *mp; // int mlen; // mlen = SCTP_HEADER_LEN(i_pak); // mp = SCTP_HEADER_TO_CHAIN(i_pak); /* does nothing in bsd since header and chain not separate */ /* * Note that if the VTag is zero, it must be an INIT. * Also, I am only interested in the content of INIT and ADDIP chunks */ // no mbuf stuff from Paolo yet so ...
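/* Illustrative packet layout assumed by the parser below (added sketch, not original source): [struct ip][struct sctphdr][chunk 1][chunk 2]...[chunk n]. bytes_left starts at ip_len minus the IP header length, drops by sizeof(struct sctphdr), and then by each chunk's 4-byte-padded length until the packet ends or a parse/lookup limit fires. */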
sm->ip_hdr = pip; /* remove ip header length from the bytes_left */ bytes_left = ntohs(pip->ip_len) - (pip->ip_hl << 2); /* Check SCTP header length and move to first chunk */ if (bytes_left < sizeof(struct sctphdr)) { sm->sctp_hdr = NULL; return(SN_PARSE_ERROR_IPSHL); /* packet not long enough*/ } sm->sctp_hdr = sctp_hdr = (struct sctphdr *) ip_next(pip); bytes_left -= sizeof(struct sctphdr); /* Check for valid ports (zero valued ports would find partially initialised associations) */ if (sctp_hdr->src_port == 0 || sctp_hdr->dest_port == 0) return(SN_PARSE_ERROR_PORT); /* Check length of first chunk */ if (bytes_left < SN_MIN_CHUNK_SIZE) /* malformed chunk - could cause endless loop*/ return(SN_PARSE_ERROR_CHHL); /* packet not long enough for this chunk */ /* First chunk */ chunk_hdr = SN_SCTP_FIRSTCHUNK(sctp_hdr); chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length)); if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left)) /* malformed chunk - could cause endless loop*/ return(SN_PARSE_ERROR_CHHL); if ((chunk_hdr->chunk_flags & SCTP_HAD_NO_TCB) && ((chunk_hdr->chunk_type == SCTP_ABORT_ASSOCIATION) || (chunk_hdr->chunk_type == SCTP_SHUTDOWN_COMPLETE))) { /* T-Bit set */ if (direction == SN_TO_LOCAL) *passoc = FindSctpGlobalT(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port); else *passoc = FindSctpLocalT(la, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port); } else { /* Proper v_tag settings */ if (direction == SN_TO_LOCAL) *passoc = FindSctpGlobal(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match); else *passoc = FindSctpLocal(la, pip->ip_src, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port); } chunk_count = 1; /* Real packet parsing occurs below */ sm->msg = SN_SCTP_OTHER;/* Initialise to largest value*/ sm->chunk_length = 0; /* only care about length for key chunks */ while (IS_SCTP_CONTROL(chunk_hdr)) { switch(chunk_hdr->chunk_type) { case SCTP_INITIATION: if (chunk_length < sizeof(struct sctp_init_chunk)) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); sm->msg = SN_SCTP_INIT; sm->sctpchnk.Init = (struct sctp_init *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr)); sm->chunk_length = chunk_length; /* if no existing association, create a new one */ if (*passoc == NULL) { if (sctp_hdr->v_tag == 0){ //Init requires vtag=0 *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc)); if (*passoc == NULL) {/* out of resources */ return(SN_PARSE_ERROR_AS_MALLOC); } /* Initialise association - malloc initialises memory to zeros */ (*passoc)->state = SN_ID; LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */ (*passoc)->TableRegister = SN_NULL_TBL; return(SN_PARSE_OK); } return(SN_PARSE_ERROR_VTAG); } return(SN_PARSE_ERROR_LOOKUP); case SCTP_INITIATION_ACK: if (chunk_length < sizeof(struct sctp_init_ack_chunk)) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); sm->msg = SN_SCTP_INITACK; sm->sctpchnk.InitAck = (struct sctp_init_ack *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr)); sm->chunk_length = chunk_length; return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK)); case SCTP_ABORT_ASSOCIATION: /* access only minimum sized chunk */ sm->msg = SN_SCTP_ABORT; sm->chunk_length = chunk_length; return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP_ABORT):(SN_PARSE_OK)); case SCTP_SHUTDOWN_ACK: if (chunk_length < sizeof(struct sctp_shutdown_ack_chunk)) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); if (sm->msg
> SN_SCTP_SHUTACK) { sm->msg = SN_SCTP_SHUTACK; sm->chunk_length = chunk_length; } break; case SCTP_SHUTDOWN_COMPLETE: /* minimum sized chunk */ if (sm->msg > SN_SCTP_SHUTCOMP) { sm->msg = SN_SCTP_SHUTCOMP; sm->chunk_length = chunk_length; } return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK)); case SCTP_ASCONF: if (sm->msg > SN_SCTP_ASCONF) { if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv4addr_param))) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); //leave parameter searching to later, if required param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr + sizeof(struct sctp_asconf_chunk)); /*compulsory IP parameter*/ if (ntohs(param_hdr->param_type) == SCTP_IPV4_ADDRESS) { if ((*passoc == NULL) && (direction == SN_TO_LOCAL)) { /* AddIP with no association */ /* try look up with the ASCONF packet's alternative address */ ipv4addr.s_addr = ((struct sctp_ipv4addr_param *) param_hdr)->addr; *passoc = FindSctpGlobal(la, ipv4addr, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match); } param_hdr = (struct sctp_paramhdr *) ((char *) param_hdr + sizeof(struct sctp_ipv4addr_param)); /*asconf's compulsory address parameter */ sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv4addr_param); /* rest of chunk */ } else { if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv6addr_param))) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); param_hdr = (struct sctp_paramhdr *) ((char *) param_hdr + sizeof(struct sctp_ipv6addr_param)); /*asconf's compulsory address parameter */ sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv6addr_param); /* rest of chunk */ } sm->msg = SN_SCTP_ASCONF; sm->sctpchnk.Asconf = param_hdr; if (*passoc == NULL) { /* AddIP with no association */ *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc)); if (*passoc == NULL) {/* out of resources */ return(SN_PARSE_ERROR_AS_MALLOC); } /* Initialise association - malloc initialises memory to zeros */ (*passoc)->state = SN_ID; LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */ (*passoc)->TableRegister = SN_NULL_TBL; return(SN_PARSE_OK); } } break; case SCTP_ASCONF_ACK: if (sm->msg > SN_SCTP_ASCONFACK) { if (chunk_length < sizeof(struct sctp_asconf_ack_chunk)) /* malformed chunk*/ return(SN_PARSE_ERROR_CHHL); //leave parameter searching to later, if required param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr + sizeof(struct sctp_asconf_ack_chunk)); sm->msg = SN_SCTP_ASCONFACK; sm->sctpchnk.Asconf = param_hdr; sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_ack_chunk); } break; default: break; /* do nothing*/ } /* if no association is found exit - we need to find an Init or AddIP within sysctl_initialising_chunk_proc_limit */ if ((*passoc == NULL) && (chunk_count >= sysctl_initialising_chunk_proc_limit)) return(SN_PARSE_ERROR_LOOKUP); /* finished with this chunk, on to the next chunk*/ bytes_left-= chunk_length; /* Is this the end of the packet ? */ if (bytes_left == 0) return (*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK); /* Are there enough bytes in packet to at least retrieve length of next chunk ? */ if (bytes_left < SN_MIN_CHUNK_SIZE) return(SN_PARSE_ERROR_CHHL); chunk_hdr = SN_SCTP_NEXTCHUNK(chunk_hdr); /* Is the chunk long enough to not cause an endless loop and are there enough bytes in packet to read the chunk ?
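 * (For example, a chunk advertising a length of 18 bytes occupies * SCTP_SIZE32(18) == 20 bytes on the wire, so it is the padded length that * must fit within bytes_left.)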
*/ chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length)); if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left)) return(SN_PARSE_ERROR_CHHL); if(++chunk_count > sysctl_chunk_proc_limit) return(SN_PARSE_OK); /* limit for processing chunks, take what we get */ } if (*passoc == NULL) return (partial_match)?(SN_PARSE_ERROR_PARTIALLOOKUP):(SN_PARSE_ERROR_LOOKUP); else return(SN_PARSE_OK); } /** @ingroup packet_parser * @brief Extract Vtags from Asconf Chunk * * GetAsconfVtags scans an Asconf Chunk for the vtags parameter, and then * extracts the vtags. * * GetAsconfVtags is not called from within sctp_PktParser. It is called only * from within ID_process when an AddIP has been received. * * @param la Pointer to the relevant libalias instance * @param sm Pointer to sctp message information * @param l_vtag Pointer to the local vtag in the association this SCTP Message belongs to * @param g_vtag Pointer to the global vtag in the association this SCTP Message belongs to * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * * @return 1 - success | 0 - fail */ static int GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm, uint32_t *l_vtag, uint32_t *g_vtag, int direction) { /* To be removed when information is in the sctp headers */ #define SCTP_VTAG_PARAM 0xC007 struct sctp_vtag_param { struct sctp_paramhdr ph;/* type=SCTP_VTAG_PARAM */ uint32_t local_vtag; uint32_t remote_vtag; } __attribute__((packed)); struct sctp_vtag_param *vtag_param; struct sctp_paramhdr *param; int bytes_left; int param_size; int param_count; param_count = 1; param = sm->sctpchnk.Asconf; param_size = SCTP_SIZE32(ntohs(param->param_length)); bytes_left = sm->chunk_length; /* step through Asconf parameters */ while((bytes_left >= param_size) && (bytes_left >= SN_VTAG_PARAM_SIZE)) { if (ntohs(param->param_type) == SCTP_VTAG_PARAM) { vtag_param = (struct sctp_vtag_param *) param; switch(direction) { /* The Internet draft is a little ambiguous as to the order of these vtags. We think it is this way around. If we are wrong, the order will need to be changed. */ case SN_TO_GLOBAL: *g_vtag = vtag_param->local_vtag; *l_vtag = vtag_param->remote_vtag; break; case SN_TO_LOCAL: *g_vtag = vtag_param->remote_vtag; *l_vtag = vtag_param->local_vtag; break; } return(1); /* found */ } bytes_left -= param_size; if (bytes_left < SN_MIN_PARAM_SIZE) return(0); param = SN_SCTP_NEXTPARAM(param); param_size = SCTP_SIZE32(ntohs(param->param_length)); if (++param_count > sysctl_param_proc_limit) { SN_LOG(SN_LOG_EVENT, logsctperror("Parameter parse limit exceeded (GetAsconfVtags)", sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); return(0); /* not found, limit exceeded */ } } return(0); /* not found */ } /** @ingroup packet_parser * @brief AddGlobalIPAddresses from Init, InitAck, or AddIP packets * * AddGlobalIPAddresses scans an SCTP chunk (in sm) for Global IP addresses, and * adds them.
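 * * For example (illustrative addresses from the 192.0.2.0/24 documentation * range): an INIT carrying IPv4 address parameters 192.0.2.1 and 192.0.2.2 * results in both addresses being appended to the association's Gaddr list.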
* * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * */ static void AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction) { struct sctp_ipv4addr_param *ipv4_param; struct sctp_paramhdr *param = NULL; struct sctp_GlobalAddress *G_Addr; struct in_addr g_addr = {0}; int bytes_left = 0; int param_size; int param_count, addr_param_count = 0; switch(direction) { case SN_TO_GLOBAL: /* does not contain global addresses */ g_addr = sm->ip_hdr->ip_dst; bytes_left = 0; /* force exit */ break; case SN_TO_LOCAL: g_addr = sm->ip_hdr->ip_src; param_count = 1; switch(sm->msg) { case SN_SCTP_INIT: bytes_left = sm->chunk_length - sizeof(struct sctp_init_chunk); param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.Init + sizeof(struct sctp_init)); break; case SN_SCTP_INITACK: bytes_left = sm->chunk_length - sizeof(struct sctp_init_ack_chunk); param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.InitAck + sizeof(struct sctp_init_ack)); break; case SN_SCTP_ASCONF: bytes_left = sm->chunk_length; param = sm->sctpchnk.Asconf; break; } } if (bytes_left >= SN_MIN_PARAM_SIZE) param_size = SCTP_SIZE32(ntohs(param->param_length)); else param_size = bytes_left+1; /* force skip loop */ if ((assoc->state == SN_ID) && ((sm->msg == SN_SCTP_INIT) || (bytes_left < SN_MIN_PARAM_SIZE))) {/* add pkt address */ G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress)); if (G_Addr == NULL) {/* out of resources */ SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking", sm->sctp_hdr->v_tag, 0, direction)); assoc->num_Gaddr = 0; /* don't track any more for this assoc*/ sysctl_track_global_addresses=0; return; } G_Addr->g_addr = g_addr; if (!Add_Global_Address_to_List(assoc, G_Addr)) SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: Address already in list", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); } /* step through parameters */ while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) { if (assoc->num_Gaddr >= sysctl_track_global_addresses) { SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: Maximum Number of addresses reached", sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); return; } switch(ntohs(param->param_type)) { case SCTP_ADD_IP_ADDRESS: /* skip to address parameter - leave param_size so bytes left will be calculated properly*/ param = (struct sctp_paramhdr *) &((struct sctp_asconf_addrv4_param *) param)->addrp; case SCTP_IPV4_ADDRESS: ipv4_param = (struct sctp_ipv4addr_param *) param; /* add addresses to association */ G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress)); if (G_Addr == NULL) {/* out of resources */ SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking", sm->sctp_hdr->v_tag, 0, direction)); assoc->num_Gaddr = 0; /* don't track any more for this assoc*/ sysctl_track_global_addresses=0; return; } /* add address */ addr_param_count++; if ((sm->msg == SN_SCTP_ASCONF) && (ipv4_param->addr == INADDR_ANY)) { /* use packet address */ G_Addr->g_addr = g_addr; if (!Add_Global_Address_to_List(assoc, G_Addr)) SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: Address already in list", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); return; /*shouldn't be any other addresses if the zero address is given*/ } else 
{ G_Addr->g_addr.s_addr = ipv4_param->addr; if (!Add_Global_Address_to_List(assoc, G_Addr)) SN_LOG(SN_LOG_EVENT, logsctperror("AddGlobalIPAddress: Address already in list", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); } } bytes_left -= param_size; if (bytes_left < SN_MIN_PARAM_SIZE) break; param = SN_SCTP_NEXTPARAM(param); param_size = SCTP_SIZE32(ntohs(param->param_length)); if (++param_count > sysctl_param_proc_limit) { SN_LOG(SN_LOG_EVENT, logsctperror("Parameter parse limit exceeded (AddGlobalIPAddress)", sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); break; /* limit exceeded*/ } } if (addr_param_count == 0) { SN_LOG(SN_LOG_DETAIL, logsctperror("AddGlobalIPAddress: no address parameters to add", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); } } /** * @brief Add_Global_Address_to_List * * Adds a global IP address to an association's address list, if it is not * already there. The first address added is usually the packet's address, and * is most likely to be used, so it is added at the beginning. Subsequent * addresses are added after this one. * * @param assoc Pointer to the association this SCTP Message belongs to * @param G_addr Pointer to the global address to add * * @return 1 - success | 0 - fail */ static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr) { struct sctp_GlobalAddress *iter_G_Addr = NULL, *first_G_Addr = NULL; first_G_Addr = LIST_FIRST(&(assoc->Gaddr)); if (first_G_Addr == NULL) { LIST_INSERT_HEAD(&(assoc->Gaddr), G_addr, list_Gaddr); /* add new address to beginning of list*/ } else { LIST_FOREACH(iter_G_Addr, &(assoc->Gaddr), list_Gaddr) { if (G_addr->g_addr.s_addr == iter_G_Addr->g_addr.s_addr) return(0); /* already exists, so don't add */ } LIST_INSERT_AFTER(first_G_Addr, G_addr, list_Gaddr); /* add after the first (packet) address */ } assoc->num_Gaddr++; return(1); /* success */ } /** @ingroup packet_parser * @brief RmGlobalIPAddresses from DelIP packets * * RmGlobalIPAddresses scans an ASCONF chunk for DelIP parameters to remove the * given Global IP addresses from the association. It will not delete the * address if it is the only address in the list.
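 * * For example (illustrative): if the association tracks {192.0.2.1, * 192.0.2.2} and a DelIP for 192.0.2.1 arrives, only 192.0.2.2 remains; a * later DelIP for 192.0.2.2 is ignored, so the list never empties.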
* * * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * */ static void RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction) { struct sctp_asconf_addrv4_param *asconf_ipv4_param; struct sctp_paramhdr *param; struct sctp_GlobalAddress *G_Addr, *G_Addr_tmp; struct in_addr g_addr; int bytes_left; int param_size; int param_count; if(direction == SN_TO_GLOBAL) g_addr = sm->ip_hdr->ip_dst; else g_addr = sm->ip_hdr->ip_src; bytes_left = sm->chunk_length; param_count = 1; param = sm->sctpchnk.Asconf; if (bytes_left >= SN_MIN_PARAM_SIZE) { param_size = SCTP_SIZE32(ntohs(param->param_length)); } else { SN_LOG(SN_LOG_EVENT, logsctperror("RmGlobalIPAddress: truncated packet - cannot remove IP addresses", sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); return; } /* step through Asconf parameters */ while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) { if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS) { asconf_ipv4_param = (struct sctp_asconf_addrv4_param *) param; if (asconf_ipv4_param->addrp.addr == INADDR_ANY) { /* remove all bar pkt address */ LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) { if(G_Addr->g_addr.s_addr != sm->ip_hdr->ip_src.s_addr) { if (assoc->num_Gaddr > 1) { /* only delete if more than one */ LIST_REMOVE(G_Addr, list_Gaddr); sn_free(G_Addr); assoc->num_Gaddr--; } else { SN_LOG(SN_LOG_EVENT, logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); } } } return; /*shouldn't be any other addresses if the zero address is given*/ } else { LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) { if(G_Addr->g_addr.s_addr == asconf_ipv4_param->addrp.addr) { if (assoc->num_Gaddr > 1) { /* only delete if more than one */ LIST_REMOVE(G_Addr, list_Gaddr); sn_free(G_Addr); assoc->num_Gaddr--; break; /* Since add only adds new addresses, there should be no double entries */ } else { SN_LOG(SN_LOG_EVENT, logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)", sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); } } } } } bytes_left -= param_size; if (bytes_left == 0) return; else if (bytes_left < SN_MIN_PARAM_SIZE) { SN_LOG(SN_LOG_EVENT, logsctperror("RmGlobalIPAddress: truncated packet - may not have removed all IP addresses", sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); return; } param = SN_SCTP_NEXTPARAM(param); param_size = SCTP_SIZE32(ntohs(param->param_length)); if (++param_count > sysctl_param_proc_limit) { SN_LOG(SN_LOG_EVENT, logsctperror("Parameter parse limit exceeded (RmGlobalIPAddress)", sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); return; /* limit exceeded*/ } } } /** @ingroup packet_parser * @brief Check that ASCONF was successful * * Each ASCONF configuration parameter carries a correlation ID which should be * matched with an ASCONFack. This is difficult for a NAT, since every * association could potentially have a number of outstanding ASCONF * configuration parameters, which should only be activated on receipt of the * ACK. 
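 * * (An assumption worth making explicit, added here: a single ASCONF may * carry several AddIP/DelIP parameters, each with its own correlation ID that * the corresponding ACK echoes back, which is why matching them would require * per-parameter state in the NAT.)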
* * Currently we only look for an ACK when the NAT is setting up a new * association (i.e. AddIP for a connection that the NAT does not know about * because the original Init went through a public interface or another NAT). * Since there is currently no connection on this path, there should be no other * ASCONF configuration parameters outstanding, so we presume that any ACK is * responding to the AddIP, and activate the new association. * * @param la Pointer to the relevant libalias instance * @param sm Pointer to sctp message information * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * * @return 1 - success | 0 - fail */ static int IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction) { struct sctp_paramhdr *param; int bytes_left; int param_size; int param_count; param_count = 1; param = sm->sctpchnk.Asconf; param_size = SCTP_SIZE32(ntohs(param->param_length)); if (param_size == 8) return(1); /*success - default acknowledgement of everything */ bytes_left = sm->chunk_length; if (bytes_left < param_size) return(0); /* not found */ /* step through Asconf parameters */ while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) { if (ntohs(param->param_type) == SCTP_SUCCESS_REPORT) return(1); /* success - but can't match correlation IDs - should only be one */ /* check others just in case */ bytes_left -= param_size; if (bytes_left >= SN_MIN_PARAM_SIZE) { param = SN_SCTP_NEXTPARAM(param); } else { return(0); } param_size = SCTP_SIZE32(ntohs(param->param_length)); if (bytes_left < param_size) return(0); if (++param_count > sysctl_param_proc_limit) { SN_LOG(SN_LOG_EVENT, logsctperror("Parameter parse limit exceeded (IsASCONFack)", sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); return(0); /* not found, limit exceeded */ } } return(0); /* not success */ } /** @ingroup packet_parser * @brief Check to see if ASCONF contains an Add IP or Del IP parameter * * IsADDorDEL scans an ASCONF packet to see if it contains an AddIP or DelIP * parameter * * @param la Pointer to the relevant libalias instance * @param sm Pointer to sctp message information * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * * @return SCTP_ADD_IP_ADDRESS | SCTP_DEL_IP_ADDRESS | 0 - fail */ static int IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction) { struct sctp_paramhdr *param; int bytes_left; int param_size; int param_count; param_count = 1; param = sm->sctpchnk.Asconf; param_size = SCTP_SIZE32(ntohs(param->param_length)); bytes_left = sm->chunk_length; if (bytes_left < param_size) return(0); /* not found */ /* step through Asconf parameters */ while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) { if (ntohs(param->param_type) == SCTP_ADD_IP_ADDRESS) return(SCTP_ADD_IP_ADDRESS); else if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS) return(SCTP_DEL_IP_ADDRESS); /* check others just in case */ bytes_left -= param_size; if (bytes_left >= SN_MIN_PARAM_SIZE) { param = SN_SCTP_NEXTPARAM(param); } else { return(0); /*Neither found */ } param_size = SCTP_SIZE32(ntohs(param->param_length)); if (bytes_left < param_size) return(0); if (++param_count > sysctl_param_proc_limit) { SN_LOG(SN_LOG_EVENT, logsctperror("Parameter parse limit exceeded (IsADDorDEL)", sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); return(0); /* not found, limit exceeded */ } } return(0); /*Neither found */ } /* ---------------------------------------------------------------------- * STATE MACHINE CODE * ---------------------------------------------------------------------- */ /** @addtogroup state_machine * *
The SCTP NAT State Machine functions will: * - Process an already parsed packet * - Use the existing NAT Hash Tables * - Determine the next state for the association * - Update the NAT Hash Tables and Timer Queues * - Return the appropriate action to take with the packet */ /** @ingroup state_machine * @brief Process SCTP message * * This function is the base state machine. It calls the processing engine for * each state. * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_DROP_PKT | SN_NAT_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR | SN_PROCESSING_ERROR */ static int ProcessSctpMsg(struct libalias *la, int direction, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc) { int rtnval; switch (assoc->state) { case SN_ID: /* Idle */ rtnval = ID_process(la, direction, assoc, sm); if (rtnval != SN_NAT_PKT) { assoc->state = SN_RM;/* Mark for removal*/ } return(rtnval); case SN_INi: /* Initialising - Init */ return(INi_process(la, direction, assoc, sm)); case SN_INa: /* Initialising - AddIP */ return(INa_process(la, direction, assoc, sm)); case SN_UP: /* Association UP */ return(UP_process(la, direction, assoc, sm)); case SN_CL: /* Association Closing */ return(CL_process(la, direction, assoc, sm)); } return(SN_PROCESSING_ERROR); } /** @ingroup state_machine * @brief Process SCTP message while in the Idle state * * This function looks for an Incoming INIT or AddIP message. * * All other SCTP messages are invalid when in SN_ID, and are dropped. * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR */ static int ID_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) { switch(sm->msg) { case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk with ADDIP */ if (!sysctl_accept_global_ootb_addip && (direction == SN_TO_LOCAL)) return(SN_DROP_PKT); /* if this Asconf packet does not contain the Vtag parameters it is of no use in Idle state */ if (!GetAsconfVtags(la, sm, &(assoc->l_vtag), &(assoc->g_vtag), direction)) return(SN_DROP_PKT); case SN_SCTP_INIT: /* a packet containing an INIT chunk or an ASCONF AddIP */ if (sysctl_track_global_addresses) AddGlobalIPAddresses(sm, assoc, direction); switch(direction){ case SN_TO_GLOBAL: assoc->l_addr = sm->ip_hdr->ip_src; assoc->a_addr = FindAliasAddress(la, assoc->l_addr); assoc->l_port = sm->sctp_hdr->src_port; assoc->g_port = sm->sctp_hdr->dest_port; if(sm->msg == SN_SCTP_INIT) assoc->g_vtag = sm->sctpchnk.Init->initiate_tag; if (AddSctpAssocGlobal(la, assoc)) /* DB clash *///**** need to add dst address return((sm->msg == SN_SCTP_INIT) ? 
SN_REPLY_ABORT : SN_REPLY_ERROR); if(sm->msg == SN_SCTP_ASCONF) { if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_dst)) /* DB clash */ return(SN_REPLY_ERROR); assoc->TableRegister |= SN_WAIT_TOLOCAL; /* wait for tolocal ack */ } break; case SN_TO_LOCAL: assoc->l_addr = FindSctpRedirectAddress(la, sm); assoc->a_addr = sm->ip_hdr->ip_dst; assoc->l_port = sm->sctp_hdr->dest_port; assoc->g_port = sm->sctp_hdr->src_port; if(sm->msg == SN_SCTP_INIT) assoc->l_vtag = sm->sctpchnk.Init->initiate_tag; if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) /* DB clash */ return((sm->msg == SN_SCTP_INIT) ? SN_REPLY_ABORT : SN_REPLY_ERROR); if(sm->msg == SN_SCTP_ASCONF) { if (AddSctpAssocGlobal(la, assoc)) /* DB clash */ //**** need to add src address return(SN_REPLY_ERROR); assoc->TableRegister |= SN_WAIT_TOGLOBAL; /* wait for toglobal ack */ } break; } assoc->state = (sm->msg == SN_SCTP_INIT) ? SN_INi : SN_INa; assoc->exp = SN_I_T(la); sctp_AddTimeOut(la,assoc); return(SN_NAT_PKT); default: /* Any other type of SCTP message is not valid in Idle */ return(SN_DROP_PKT); } return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ } /** @ingroup state_machine * @brief Process SCTP message while waiting for an INIT-ACK message * * Only an INIT-ACK, resent INIT, or an ABORT SCTP packet are valid in this * state, all other packets are dropped. * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT */ static int INi_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) { switch(sm->msg) { case SN_SCTP_INIT: /* a packet containing a retransmitted INIT chunk */ sctp_ResetTimeOut(la, assoc, SN_I_T(la)); return(SN_NAT_PKT); case SN_SCTP_INITACK: /* a packet containing an INIT-ACK chunk */ switch(direction){ case SN_TO_LOCAL: if (assoc->num_Gaddr) /*If tracking global addresses for this association */ AddGlobalIPAddresses(sm, assoc, direction); assoc->l_vtag = sm->sctpchnk.Init->initiate_tag; if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) { /* DB clash */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_SEND_ABORT); } break; case SN_TO_GLOBAL: assoc->l_addr = sm->ip_hdr->ip_src; // Only if not set in Init! 
* assoc->g_vtag = sm->sctpchnk.Init->initiate_tag; if (AddSctpAssocGlobal(la, assoc)) { /* DB clash */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_SEND_ABORT); } break; } assoc->state = SN_UP;/* association established for NAT */ sctp_ResetTimeOut(la,assoc, SN_U_T(la)); return(SN_NAT_PKT); case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); default: return(SN_DROP_PKT); } return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ } /** @ingroup state_machine * @brief Process SCTP message while waiting for an AddIp-ACK message * * Only an AddIP-ACK, resent AddIP, or an ABORT message are valid, all other * SCTP packets are dropped * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_NAT_PKT | SN_DROP_PKT */ static int INa_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) { switch(sm->msg) { case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/ sctp_ResetTimeOut(la,assoc, SN_I_T(la)); return(SN_NAT_PKT); case SN_SCTP_ASCONFACK: /* a packet containing an ASCONF chunk with a ADDIP-ACK */ switch(direction){ case SN_TO_LOCAL: if (!(assoc->TableRegister & SN_WAIT_TOLOCAL)) /* wrong direction */ return(SN_DROP_PKT); break; case SN_TO_GLOBAL: if (!(assoc->TableRegister & SN_WAIT_TOGLOBAL)) /* wrong direction */ return(SN_DROP_PKT); } if (IsASCONFack(la,sm,direction)) { assoc->TableRegister &= SN_BOTH_TBL; /* remove wait flags */ assoc->state = SN_UP; /* association established for NAT */ sctp_ResetTimeOut(la,assoc, SN_U_T(la)); return(SN_NAT_PKT); } else { assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); } case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); default: return(SN_DROP_PKT); } return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ } /** @ingroup state_machine * @brief Process SCTP messages while association is UP redirecting packets * * While in the SN_UP state, all packets for the particular association * are passed. Only a SHUT-ACK or an ABORT will cause a change of state. 
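 * * For example, a DATA packet arriving in SN_UP simply refreshes the SN_U_T * timeout and is forwarded, whereas a SHUTDOWN-ACK moves the association to * SN_CL with the shorter SN_C_T timeout (an illustrative summary of the * switch below).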
* * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_NAT_PKT | SN_DROP_PKT */ static int UP_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) { switch(sm->msg) { case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */ assoc->state = SN_CL; sctp_ResetTimeOut(la,assoc, SN_C_T(la)); return(SN_NAT_PKT); case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/ if ((direction == SN_TO_LOCAL) && assoc->num_Gaddr) /*If tracking global addresses for this association & from global side */ switch(IsADDorDEL(la,sm,direction)) { case SCTP_ADD_IP_ADDRESS: AddGlobalIPAddresses(sm, assoc, direction); break; case SCTP_DEL_IP_ADDRESS: RmGlobalIPAddresses(sm, assoc, direction); break; } /* fall through to default */ default: sctp_ResetTimeOut(la,assoc, SN_U_T(la)); return(SN_NAT_PKT); /* forward packet */ } return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ } /** @ingroup state_machine * @brief Process SCTP message while association is in the process of closing * * This function waits for a SHUT-COMP to close the association. Depending on * the setting of sysctl_holddown_timer it may not remove the association * immediately, but leave it up until SN_X_T(la). Only SHUT-COMP, SHUT-ACK, and * ABORT packets are permitted in this state. All other packets are dropped. * * @param la Pointer to the relevant libalias instance * @param direction SN_TO_LOCAL | SN_TO_GLOBAL * @param sm Pointer to sctp message information * @param assoc Pointer to the association this SCTP Message belongs to * * @return SN_NAT_PKT | SN_DROP_PKT */ static int CL_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) { switch(sm->msg) { case SN_SCTP_SHUTCOMP: /* a packet containing a SHUTDOWN-COMPLETE chunk */ assoc->state = SN_CL; /* Stay in Close state until timeout */ if (sysctl_holddown_timer > 0) sctp_ResetTimeOut(la, assoc, SN_X_T(la));/* allow to stay open for Tbit packets*/ else assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */ assoc->state = SN_CL; /* Stay in Close state until timeout */ sctp_ResetTimeOut(la, assoc, SN_C_T(la)); return(SN_NAT_PKT); case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ assoc->state = SN_RM;/* Mark for removal*/ return(SN_NAT_PKT); default: return(SN_DROP_PKT); } return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ } /* ---------------------------------------------------------------------- * HASH TABLE CODE * ---------------------------------------------------------------------- */ /** @addtogroup Hash * * The Hash functions facilitate searching the NAT Hash Tables for associations * as well as adding/removing associations from the table(s). 
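 * * A sketch of the lookup key, as implied by the SN_TABLE_HASH usage below: * both tables index on hash(vtag, port) modulo sctpNatTableSize, and the * global address list is only consulted after a vtag/port bucket entry * matches.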
*/ /** @ingroup Hash * @brief Find the SCTP association given the local address, port and vtag * * Searches the local look-up table for the association entry matching the * provided local <address:ports:vtag> tuple * * @param la Pointer to the relevant libalias instance * @param l_addr local address * @param g_addr global address * @param l_vtag local Vtag * @param l_port local Port * @param g_port global Port * * @return pointer to association or NULL */ static struct sctp_nat_assoc* FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port) { u_int i; struct sctp_nat_assoc *assoc = NULL; struct sctp_GlobalAddress *G_Addr = NULL; if (l_vtag != 0) { /* an init packet, vtag==0 */ i = SN_TABLE_HASH(l_vtag, l_port, la->sctpNatTableSize); LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { if ((assoc->l_vtag == l_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port) && (assoc->l_addr.s_addr == l_addr.s_addr)) { if (assoc->num_Gaddr) { LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { if(G_Addr->g_addr.s_addr == g_addr.s_addr) return(assoc); } } else { return(assoc); } } } } return(NULL); } /** @ingroup Hash * @brief Check for Global Clash * * Searches the global look-up table for the association entry matching the * provided global <(addresses):ports:vtag> tuple * * @param la Pointer to the relevant libalias instance * @param Cassoc association being checked for a clash * * @return pointer to association or NULL */ static struct sctp_nat_assoc* FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc) { u_int i; struct sctp_nat_assoc *assoc = NULL; struct sctp_GlobalAddress *G_Addr = NULL; struct sctp_GlobalAddress *G_AddrC = NULL; if (Cassoc->g_vtag != 0) { /* an init packet, vtag==0 */ i = SN_TABLE_HASH(Cassoc->g_vtag, Cassoc->g_port, la->sctpNatTableSize); LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { if ((assoc->g_vtag == Cassoc->g_vtag) && (assoc->g_port == Cassoc->g_port) && (assoc->l_port == Cassoc->l_port)) { if (assoc->num_Gaddr) { LIST_FOREACH(G_AddrC, &(Cassoc->Gaddr), list_Gaddr) { LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { if(G_Addr->g_addr.s_addr == G_AddrC->g_addr.s_addr) return(assoc); } } } else { return(assoc); } } } } return(NULL); } /** @ingroup Hash * @brief Find the SCTP association given the global port and vtag * * Searches the global look-up table for the association entry matching the * provided global <address:ports:vtag> tuple * * If all but the global address match it sets partial_match to 1 to indicate a * partial match. If the NAT is tracking global IP addresses for this * association, the NAT may respond with an ErrorM to request the missing * address to be added.
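 * * For example (illustrative): if <vtag:ports> match an entry but the * packet's global address is not in that entry's Gaddr list, partial_match * is set to 1 and NULL is returned.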
* * @param la Pointer to the relevant libalias instance * @param g_addr global address * @param g_vtag global vtag * @param g_port global port * @param l_port local port * * @return pointer to association or NULL */ static struct sctp_nat_assoc* FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match) { u_int i; struct sctp_nat_assoc *assoc = NULL; struct sctp_GlobalAddress *G_Addr = NULL; *partial_match = 0; if (g_vtag != 0) { /* an init packet, vtag==0 */ i = SN_TABLE_HASH(g_vtag, g_port, la->sctpNatTableSize); LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { if ((assoc->g_vtag == g_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) { *partial_match = 1; if (assoc->num_Gaddr) { LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { if(G_Addr->g_addr.s_addr == g_addr.s_addr) return(assoc); } } else { return(assoc); } } } } return(NULL); } /** @ingroup Hash * @brief Find the SCTP association for a T-Flag message (given the global port and local vtag) * * Searches the local look-up table for a unique association entry matching the * provided global port and local vtag information * * @param la Pointer to the relevant libalias instance * @param g_addr global address * @param l_vtag local Vtag * @param g_port global Port * @param l_port local Port * * @return pointer to association or NULL */ static struct sctp_nat_assoc* FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port) { u_int i; struct sctp_nat_assoc *assoc = NULL, *lastmatch = NULL; struct sctp_GlobalAddress *G_Addr = NULL; int cnt = 0; if (l_vtag != 0) { /* an init packet, vtag==0 */ i = SN_TABLE_HASH(l_vtag, g_port, la->sctpNatTableSize); LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { if ((assoc->g_vtag == l_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) { if (assoc->num_Gaddr) { LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { if(G_Addr->g_addr.s_addr == g_addr.s_addr) return(assoc); /* full match */ } } else { if (++cnt > 1) return(NULL); lastmatch = assoc; } } } } /* If there is more than one match we do not know which local address to send to */ return( cnt ?
lastmatch : NULL ); } /** @ingroup Hash * @brief Find the SCTP association for a T-Flag message (given the local port and global vtag) * * Searches the global look-up table for a unique association entry matching the * provided local port and global vtag information * * @param la Pointer to the relevant libalias instance * @param g_addr global address * @param g_vtag global vtag * @param l_port local port * @param g_port global port * * @return pointer to association or NULL */ static struct sctp_nat_assoc* FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port) { u_int i; struct sctp_nat_assoc *assoc = NULL; struct sctp_GlobalAddress *G_Addr = NULL; if (g_vtag != 0) { /* an init packet, vtag==0 */ i = SN_TABLE_HASH(g_vtag, l_port, la->sctpNatTableSize); LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { if ((assoc->l_vtag == g_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port)) { if (assoc->num_Gaddr) { LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { if(G_Addr->g_addr.s_addr == g_addr.s_addr) return(assoc); } } else { return(assoc); } } } } return(NULL); } /** @ingroup Hash * @brief Add the sctp association information to the local look up table * * Searches the local look-up table for an existing association with the same * details. If a match exists and is ONLY in the local look-up table then this * is a repeated INIT packet, we need to remove this association from the * look-up table and add the new association * * The new association is added to the head of the list and state is updated * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association * @param g_addr global address * * @return SN_ADD_OK | SN_ADD_CLASH */ static int AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr) { struct sctp_nat_assoc *found; LIBALIAS_LOCK_ASSERT(la); found = FindSctpLocal(la, assoc->l_addr, g_addr, assoc->l_vtag, assoc->l_port, assoc->g_port); /* * Note that if a different global address initiated this Init, * ie it wasn't resent as presumed: * - the local receiver if receiving it for the first time will establish * an association with the new global host * - if receiving an init from a different global address after sending a * lost initack it will send an initack to the new global host, the first * association attempt will then be blocked if retried. */ if (found != NULL) { if ((found->TableRegister == SN_LOCAL_TBL) && (found->g_port == assoc->g_port)) { /* resent message */ RmSctpAssoc(la, found); sctp_RmTimeOut(la, found); freeGlobalAddressList(found); sn_free(found); } else return(SN_ADD_CLASH); } LIST_INSERT_HEAD(&la->sctpTableLocal[SN_TABLE_HASH(assoc->l_vtag, assoc->l_port, la->sctpNatTableSize)], assoc, list_L); assoc->TableRegister |= SN_LOCAL_TBL; la->sctpLinkCount++; //increment link count if (assoc->TableRegister == SN_BOTH_TBL) { /* libalias log -- controlled by libalias */ if (la->packetAliasMode & PKT_ALIAS_LOG) SctpShowAliasStats(la); SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^")); } return(SN_ADD_OK); } /** @ingroup Hash * @brief Add the sctp association information to the global look up table * * Searches the global look-up table for an existing association with the same * details. 
If a match exists and is ONLY in the global look-up table then this * is a repeated INIT packet, so we remove this association from the * look-up table and add the new association * * The new association is added to the head of the list and state is updated * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association * * @return SN_ADD_OK | SN_ADD_CLASH */ static int AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc) { struct sctp_nat_assoc *found; LIBALIAS_LOCK_ASSERT(la); found = FindSctpGlobalClash(la, assoc); if (found != NULL) { if ((found->TableRegister == SN_GLOBAL_TBL) && (found->l_addr.s_addr == assoc->l_addr.s_addr) && (found->l_port == assoc->l_port)) { /* resent message */ RmSctpAssoc(la, found); sctp_RmTimeOut(la, found); freeGlobalAddressList(found); sn_free(found); } else return(SN_ADD_CLASH); } LIST_INSERT_HEAD(&la->sctpTableGlobal[SN_TABLE_HASH(assoc->g_vtag, assoc->g_port, la->sctpNatTableSize)], assoc, list_G); assoc->TableRegister |= SN_GLOBAL_TBL; la->sctpLinkCount++; //increment link count if (assoc->TableRegister == SN_BOTH_TBL) { /* libalias log -- controlled by libalias */ if (la->packetAliasMode & PKT_ALIAS_LOG) SctpShowAliasStats(la); SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^")); } return(SN_ADD_OK); } /** @ingroup Hash * @brief Remove the sctp association information from the look-up tables * * For each of the two (local/global) look-up tables, remove the association * from that table IF it has been registered in that table. * * NOTE: The calling code is responsible for freeing memory allocated to the * association structure itself * * NOTE: The association is NOT removed from the timer queue * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association */ static void RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc) { // struct sctp_nat_assoc *found; if (assoc == NULL) { /* very bad, log and die */ SN_LOG(SN_LOG_LOW, logsctperror("ERROR: alias_sctp:RmSctpAssoc(NULL)\n", 0, 0, SN_TO_NODIR)); return; } /* log if association is fully up and now closing */ if (assoc->TableRegister == SN_BOTH_TBL) { SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "$")); } LIBALIAS_LOCK_ASSERT(la); if (assoc->TableRegister & SN_LOCAL_TBL) { assoc->TableRegister ^= SN_LOCAL_TBL; la->sctpLinkCount--; //decrement link count LIST_REMOVE(assoc, list_L); } if (assoc->TableRegister & SN_GLOBAL_TBL) { assoc->TableRegister ^= SN_GLOBAL_TBL; la->sctpLinkCount--; //decrement link count LIST_REMOVE(assoc, list_G); } // sn_free(assoc); //Don't remove now, remove if needed later /* libalias logging -- controlled by libalias log definition */ if (la->packetAliasMode & PKT_ALIAS_LOG) SctpShowAliasStats(la); } /** * @ingroup Hash * @brief free the Global Address List memory * * freeGlobalAddressList deletes all global IP addresses in an association's * global IP address list. * * @param assoc pointer to sctp association */ static void freeGlobalAddressList(struct sctp_nat_assoc *assoc) { struct sctp_GlobalAddress *gaddr1 = NULL, *gaddr2 = NULL; /* free global address list */ gaddr1 = LIST_FIRST(&(assoc->Gaddr)); while (gaddr1 != NULL) { gaddr2 = LIST_NEXT(gaddr1, list_Gaddr); sn_free(gaddr1); gaddr1 = gaddr2; } } /* ---------------------------------------------------------------------- * TIMER QUEUE CODE * ---------------------------------------------------------------------- */ /** @addtogroup Timer * * The timer queue management functions are designed to operate efficiently with * a minimum of interaction with the queues.
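 *
 * The queue itself is a simple timing wheel: a fixed array of
 * SN_TIMER_QUEUE_SIZE buckets, one per timestamp tick, with cur_loc
 * marking the bucket for the current time (loc_time). As a sketch of
 * the slot arithmetic used by sctp_AddTimeOut() below (the conditional
 * subtraction there stands in for the modulo, assuming no timeout ever
 * exceeds the queue length):
 *
 *	add_loc = (assoc->exp - loc_time + cur_loc) % SN_TIMER_QUEUE_SIZE;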
* * Once a timeout is set in the queue it will not be altered in the queue unless * it has to be changed to a shorter time (usually only for aborts and closing). * On a queue timeout, the real expiry time is checked, and if it has not yet * been reached the entry is requeued (O(1)) at its later time. This is * especially important for normal packets sent during an association. When a * timer expires, it is updated to its new expiration time if necessary, or * processed as a timeout. This means that while in the UP state, the timer * queue is only altered every U_T (every few minutes) for a particular * association. */ /** @ingroup Timer * @brief Add an association timeout to the timer queue * * Determine the location in the queue to add the timeout and insert the * association into the list at that queue position * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association */ static void sctp_AddTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc) { int add_loc; LIBALIAS_LOCK_ASSERT(la); add_loc = assoc->exp - la->sctpNatTimer.loc_time + la->sctpNatTimer.cur_loc; if (add_loc >= SN_TIMER_QUEUE_SIZE) add_loc -= SN_TIMER_QUEUE_SIZE; LIST_INSERT_HEAD(&la->sctpNatTimer.TimerQ[add_loc], assoc, timer_Q); assoc->exp_loc = add_loc; } /** @ingroup Timer * @brief Remove an association from the timer queue * * This is an O(1) operation to remove the association pointer from its * current position in the timer queue * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association */ static void sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc) { LIBALIAS_LOCK_ASSERT(la); LIST_REMOVE(assoc, timer_Q); /* Note this is O(1) */ } /** @ingroup Timer * @brief Reset timer in timer queue * * Reset the actual timeout for the specified association. If it is earlier than * the existing timeout, then remove and re-install the association into the * queue * * @param la Pointer to the relevant libalias instance * @param assoc pointer to sctp association * @param newexp New expiration time */ static void sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp) { if (newexp < assoc->exp) { sctp_RmTimeOut(la, assoc); assoc->exp = newexp; sctp_AddTimeOut(la, assoc); } else { assoc->exp = newexp; } } /** @ingroup Timer * @brief Check timer Q against current time * * Loop through each entry in the timer queue since the last time we processed * the timer queue until now (the current time). For each association in the * event list, we remove it from that position in the timer queue and check if * it has really expired. If so we: * - Log the timer expiry * - Remove the association from the NAT tables * - Release the memory used by the association * * If the timer hasn't really expired we place the association into its new * correct position in the timer queue.
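 *
 * For example, an association in the UP state whose timeout was pushed
 * back by traffic still sits in its original bucket; when that bucket
 * comes due, la->timeStamp < assoc->exp, so the entry is simply
 * reinserted O(1) at the bucket for its real expiry and no NAT table
 * work is done at all.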
* * @param la Pointer to the relevant libalias instance */ void sctp_CheckTimers(struct libalias *la) { struct sctp_nat_assoc *assoc; LIBALIAS_LOCK_ASSERT(la); while(la->timeStamp >= la->sctpNatTimer.loc_time) { while (!LIST_EMPTY(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc])) { assoc = LIST_FIRST(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc]); //SLIST_REMOVE_HEAD(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc], timer_Q); LIST_REMOVE(assoc, timer_Q); if (la->timeStamp >= assoc->exp) { /* state expired */ SN_LOG(((assoc->state == SN_CL)?(SN_LOG_DEBUG):(SN_LOG_INFO)), logsctperror("Timer Expired", assoc->g_vtag, assoc->state, SN_TO_NODIR)); RmSctpAssoc(la, assoc); freeGlobalAddressList(assoc); sn_free(assoc); } else { /* state not expired, reschedule timer */ sctp_AddTimeOut(la, assoc); } } /* Go to the next location in the timer queue */ ++la->sctpNatTimer.loc_time; if (++la->sctpNatTimer.cur_loc >= SN_TIMER_QUEUE_SIZE) la->sctpNatTimer.cur_loc = 0; } } /* ---------------------------------------------------------------------- * LOGGING CODE * ---------------------------------------------------------------------- */ /** @addtogroup Logging * * The logging functions provide logging of different items ranging from * a simple message, through an association's details, to the current state * of the NAT tables */ /** @ingroup Logging * @brief Log sctp nat errors * * @param errormsg Error message to be logged * @param vtag Current Vtag * @param error Error number * @param direction Direction of packet */ static void logsctperror(char* errormsg, uint32_t vtag, int error, int direction) { char dir; switch(direction) { case SN_TO_LOCAL: dir = 'L'; break; case SN_TO_GLOBAL: dir = 'G'; break; default: dir = '*'; break; } SctpAliasLog("->%c %s (vt=%u) %d\n", dir, errormsg, ntohl(vtag), error); } /** @ingroup Logging * @brief Log what the parser parsed * * @param direction Direction of packet * @param sm Pointer to sctp message information */ static void logsctpparse(int direction, struct sctp_nat_msg *sm) { char *ploc, *pstate; switch(direction) { case SN_TO_LOCAL: ploc = "TO_LOCAL -"; break; case SN_TO_GLOBAL: ploc = "TO_GLOBAL -"; break; default: ploc = ""; } switch(sm->msg) { case SN_SCTP_INIT: pstate = "Init"; break; case SN_SCTP_INITACK: pstate = "InitAck"; break; case SN_SCTP_ABORT: pstate = "Abort"; break; case SN_SCTP_SHUTACK: pstate = "ShutAck"; break; case SN_SCTP_SHUTCOMP: pstate = "ShutComp"; break; case SN_SCTP_ASCONF: pstate = "Asconf"; break; case SN_SCTP_ASCONFACK: pstate = "AsconfAck"; break; case SN_SCTP_OTHER: pstate = "Other"; break; default: pstate = "***ERROR***"; break; } SctpAliasLog("Parsed: %s %s\n", ploc, pstate); } /** @ingroup Logging * @brief Log an SCTP association's details * * @param assoc pointer to sctp association * @param s Character that indicates the state of processing for this packet */ static void logsctpassoc(struct sctp_nat_assoc *assoc, char* s) { struct sctp_GlobalAddress *G_Addr = NULL; char *sp; + char addrbuf[INET_ADDRSTRLEN]; + switch(assoc->state) { case SN_ID: sp = "ID "; break; case SN_INi: sp = "INi "; break; case SN_INa: sp = "INa "; break; case SN_UP: sp = "UP "; break; case SN_CL: sp = "CL "; break; case SN_RM: sp = "RM "; break; default: sp = "***ERROR***"; break; } SctpAliasLog("%sAssoc: %s exp=%u la=%s lv=%u lp=%u gv=%u gp=%u tbl=%d\n", - s, sp, assoc->exp, inet_ntoa(assoc->l_addr), ntohl(assoc->l_vtag), - ntohs(assoc->l_port), ntohl(assoc->g_vtag), ntohs(assoc->g_port), + s, sp, assoc->exp, inet_ntoa_r(assoc->l_addr,
addrbuf), + ntohl(assoc->l_vtag), ntohs(assoc->l_port), + ntohl(assoc->g_vtag), ntohs(assoc->g_port), assoc->TableRegister); /* list global addresses */ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { - SctpAliasLog("\t\tga=%s\n",inet_ntoa(G_Addr->g_addr)); + SctpAliasLog("\t\tga=%s\n", + inet_ntoa_r(G_Addr->g_addr, addrbuf)); } } /** @ingroup Logging * @brief Output Global table to log * * @param la Pointer to the relevant libalias instance */ static void logSctpGlobal(struct libalias *la) { u_int i; struct sctp_nat_assoc *assoc = NULL; SctpAliasLog("G->\n"); for (i=0; i < la->sctpNatTableSize; i++) { LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { logsctpassoc(assoc, " "); } } } /** @ingroup Logging * @brief Output Local table to log * * @param la Pointer to the relevant libalias instance */ static void logSctpLocal(struct libalias *la) { u_int i; struct sctp_nat_assoc *assoc = NULL; SctpAliasLog("L->\n"); for (i=0; i < la->sctpNatTableSize; i++) { LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { logsctpassoc(assoc, " "); } } } /** @ingroup Logging * @brief Output timer queue to log * * @param la Pointer to the relevant libalias instance */ static void logTimerQ(struct libalias *la) { static char buf[50]; u_int i; struct sctp_nat_assoc *assoc = NULL; SctpAliasLog("t->\n"); for (i=0; i < SN_TIMER_QUEUE_SIZE; i++) { LIST_FOREACH(assoc, &la->sctpNatTimer.TimerQ[i], timer_Q) { snprintf(buf, 50, " l=%u ",i); //SctpAliasLog(la->logDesc," l=%d ",i); logsctpassoc(assoc, buf); } } } /** @ingroup Logging * @brief Sctp NAT logging function * * This function is based on a similar function in alias_db.c * * @param str/stream logging descriptor * @param format printf type string */ #ifdef _KERNEL static void SctpAliasLog(const char *format, ...) { char buffer[LIBALIAS_BUF_SIZE]; va_list ap; va_start(ap, format); vsnprintf(buffer, LIBALIAS_BUF_SIZE, format, ap); va_end(ap); log(LOG_SECURITY | LOG_INFO, "alias_sctp: %s", buffer); } #else static void SctpAliasLog(FILE *stream, const char *format, ...) { va_list ap; va_start(ap, format); vfprintf(stream, format, ap); va_end(ap); fflush(stream); } #endif Index: head/sys/netinet/tcp_hostcache.c =================================================================== --- head/sys/netinet/tcp_hostcache.c (revision 313820) +++ head/sys/netinet/tcp_hostcache.c (revision 313821) @@ -1,744 +1,746 @@ /*- * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The tcp_hostcache moves the tcp-specific cached metrics from the routing * table to a dedicated structure indexed by the remote IP address. It keeps * information on the measured TCP parameters of past TCP sessions to allow * better initial start values to be used with later connections to/from the * same source. Depending on the network parameters (delay, max MTU, * congestion window) between local and remote sites, this can lead to * significant speed-ups for new TCP connections after the first one. * * Due to the tcp_hostcache, all TCP-specific metrics information in the * routing table has been removed. The inpcb no longer keeps a pointer to * the routing entry, and protocol-initiated route cloning has been removed * as well. With these changes, the routing table has gone back to being * more lightweight and only carries information related to packet forwarding. * * tcp_hostcache is designed for multiple concurrent access in SMP * environments and high contention. All bucket rows have their own lock and * thus multiple lookups and modifies can be done at the same time as long as * they are in different bucket rows. If a request for insertion of a new * record can't be satisfied, it simply returns an empty structure. Nobody * and nothing outside of tcp_hostcache.c will ever point directly to any * entry in the tcp_hostcache. All communication is done in an * object-oriented way and only functions of tcp_hostcache will manipulate * hostcache entries. Otherwise, we are unable to achieve good behaviour in * concurrent access situations. Since tcp_hostcache is only caching * information, there are no fatal consequences if we either can't satisfy * any particular request or have to drop/overwrite an existing entry because * of bucket limit memory constraints. */ /* * Many thanks to jlemon for basic structure of tcp_syncache which is being * followed here.
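 *
 * A minimal consumer-side sketch (hypothetical caller; tcp_hc_get()
 * and tcp_hc_updatemtu() below are the real entry points, and the
 * in_conninfo comes from the connection's inpcb):
 *
 *	struct hc_metrics_lite hcml;
 *
 *	tcp_hc_get(&inp->inp_inc, &hcml);	// zeroed if nothing cached
 *	if (hcml.rmx_mtu != 0)
 *		;				// seed MSS from cached path MTU
 *	...
 *	tcp_hc_updatemtu(&inp->inp_inc, mtu);	// store a new discovery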
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #ifdef INET6 #include #endif #include /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache); #define V_tcp_hostcache VNET(tcp_hostcache) static VNET_DEFINE(struct callout, tcp_hc_callout); #define V_tcp_hc_callout VNET(tcp_hc_callout) static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge_internal(int); static void tcp_hc_purge(void *); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); VNET_DEFINE(int, tcp_use_hostcache) = 1; #define V_tcp_use_hostcache VNET(tcp_use_hostcache) SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_use_hostcache), 0, "Enable the TCP hostcache"); SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.cache_limit), 0, "Overall entry limit for hostcache"); SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.hashsize), 0, "Size of TCP hostcache hashtable"); SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, "Per-bucket hash limit for hostcache"); SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_hostcache.cache_count), 0, "Current number of entries in hostcache"); SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.expire), 0, "Expire time of TCP hostcache entries"); SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.prune), 0, "Time between purge runs"); SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.purgeall), 0, "Expire all entries on next purge run"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, sysctl_tcp_hc_list, "A", "List of all hostcache entries"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_tcp_hc_purgenow, "I", "Immediately purge all entries"); static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); #define HOSTCACHE_HASH(ip) \ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ V_tcp_hostcache.hashmask) /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ #define HOSTCACHE_HASH6(ip6) \ (((ip6)->s6_addr32[0] ^ \ (ip6)->s6_addr32[1] ^ \ (ip6)->s6_addr32[2] ^ \ (ip6)->s6_addr32[3]) & \ V_tcp_hostcache.hashmask) #define THC_LOCK(lp) mtx_lock(lp) #define THC_UNLOCK(lp) mtx_unlock(lp) void tcp_hc_init(void) { u_int cache_limit; int i; /* * Initialize hostcache structures.
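 *
 * The power-of-two check below exists because buckets are selected
 * with a bit mask rather than a modulo: with the default hashsize of
 * 512, hashmask = 511 = 0x1ff, so for instance
 *
 *	HOSTCACHE_HASH(ip) = (ip ^ (ip >> 7) ^ (ip >> 17)) & 0x1ff
 *
 * and the mask only covers every bucket when hashsize - 1 is all
 * one-bits.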
*/ V_tcp_hostcache.cache_count = 0; V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", &V_tcp_hostcache.hashsize); if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ } V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", &V_tcp_hostcache.bucket_limit); cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; V_tcp_hostcache.cache_limit = cache_limit; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", &V_tcp_hostcache.cache_limit); if (V_tcp_hostcache.cache_limit > cache_limit) V_tcp_hostcache.cache_limit = cache_limit; /* * Allocate the hash table. */ V_tcp_hostcache.hashbase = (struct hc_head *) malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), M_HOSTCACHE, M_WAITOK | M_ZERO); /* * Initialize the hash buckets. */ for (i = 0; i < V_tcp_hostcache.hashsize; i++) { TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); V_tcp_hostcache.hashbase[i].hch_length = 0; mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", NULL, MTX_DEF); } /* * Allocate the hostcache entries. */ V_tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); /* * Set up periodic cache cleanup. */ callout_init(&V_tcp_hc_callout, 1); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, curvnet); } #ifdef VIMAGE void tcp_hc_destroy(void) { int i; callout_drain(&V_tcp_hc_callout); /* Purge all hc entries. */ tcp_hc_purge_internal(1); /* Free the uma zone and the allocated hash table. */ uma_zdestroy(V_tcp_hostcache.zone); for (i = 0; i < V_tcp_hostcache.hashsize; i++) mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx); free(V_tcp_hostcache.hashbase, M_HOSTCACHE); } #endif /* * Internal function: look up an entry in the hostcache or return NULL. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_lookup(struct in_conninfo *inc) { int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; if (!V_tcp_use_hostcache) return NULL; KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_flags & INC_ISIPV6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * Iterate through entries in bucket row looking for a match. */ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { if (inc->inc_flags & INC_ISIPV6) { /* XXX: check ip6_zoneid */ if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, sizeof(inc->inc6_faddr)) == 0) return hc_entry; } else { if (memcmp(&inc->inc_faddr, &hc_entry->ip4, sizeof(inc->inc_faddr)) == 0) return hc_entry; } } /* * We were unsuccessful and didn't find anything. 
*/ THC_UNLOCK(&hc_head->hch_mtx); return NULL; } /* * Internal function: insert an entry into the hostcache or return NULL if * unable to allocate a new one. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_insert(struct in_conninfo *inc) { int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; if (!V_tcp_use_hostcache) return NULL; KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_flags & INC_ISIPV6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * If the bucket limit is reached, reuse the least-used element. */ if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); /* * At first we were dropping the last element, just to * reacquire it in the next two lines again, which isn't very * efficient. Instead just reuse the least used element. * We may drop something that is still "in-use" but we can be * "lossy". * Just give up if this bucket row is empty and we don't have * anything to replace. */ if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length--; V_tcp_hostcache.cache_count--; TCPSTAT_INC(tcps_hc_bucketoverflow); #if 0 uma_zfree(V_tcp_hostcache.zone, hc_entry); #endif } else { /* * Allocate a new entry, or balk if not possible. */ hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } } /* * Initialize basic information of hostcache entry. */ bzero(hc_entry, sizeof(*hc_entry)); if (inc->inc_flags & INC_ISIPV6) { hc_entry->ip6 = inc->inc6_faddr; hc_entry->ip6_zoneid = inc->inc6_zoneid; } else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* * Put it upfront. */ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length++; V_tcp_hostcache.cache_count++; TCPSTAT_INC(tcps_hc_added); return hc_entry; } /* * External function: look up an entry in the hostcache and fill out the * supplied TCP metrics structure. Fills in NULL when no entry was found or * a value is not set. */ void tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) { struct hc_metrics *hc_entry; if (!V_tcp_use_hostcache) return; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object. */ if (hc_entry == NULL) { bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); return; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; /* * Unlock bucket row. 
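 * tcp_hc_lookup() handed us this row still locked; every external
 * accessor must drop the lock itself once it has copied the entry's
 * fields out, which is why each of the tcp_hc_* functions below ends
 * with this same THC_UNLOCK() on hc_entry->rmx_head.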
*/ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: look up an entry in the hostcache and return the * discovered path MTU. Returns 0 if no entry is found or value is not * set. */ uint32_t tcp_hc_getmtu(struct in_conninfo *inc) { struct hc_metrics *hc_entry; uint32_t mtu; if (!V_tcp_use_hostcache) return 0; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { return 0; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ mtu = hc_entry->rmx_mtu; THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); return mtu; } /* * External function: update the MTU value of an entry in the hostcache. * Creates a new entry if none was found. */ void tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu) { struct hc_metrics *hc_entry; if (!V_tcp_use_hostcache) return; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object, try to insert a new one. */ if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_entry->rmx_mtu = mtu; /* * Put it upfront so we find it faster next time. */ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); /* * Unlock bucket row. */ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: update the TCP metrics of an entry in the hostcache. * Creates a new entry if none was found. */ void tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) { struct hc_metrics *hc_entry; if (!V_tcp_use_hostcache) return; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ if (hcml->rmx_rtt != 0) { if (hc_entry->rmx_rtt == 0) hc_entry->rmx_rtt = hcml->rmx_rtt; else hc_entry->rmx_rtt = ((uint64_t)hc_entry->rmx_rtt + (uint64_t)hcml->rmx_rtt) / 2; TCPSTAT_INC(tcps_cachedrtt); } if (hcml->rmx_rttvar != 0) { if (hc_entry->rmx_rttvar == 0) hc_entry->rmx_rttvar = hcml->rmx_rttvar; else hc_entry->rmx_rttvar = ((uint64_t)hc_entry->rmx_rttvar + (uint64_t)hcml->rmx_rttvar) / 2; TCPSTAT_INC(tcps_cachedrttvar); } if (hcml->rmx_ssthresh != 0) { if (hc_entry->rmx_ssthresh == 0) hc_entry->rmx_ssthresh = hcml->rmx_ssthresh; else hc_entry->rmx_ssthresh = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; TCPSTAT_INC(tcps_cachedssthresh); } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) hc_entry->rmx_cwnd = hcml->rmx_cwnd; else hc_entry->rmx_cwnd = ((uint64_t)hc_entry->rmx_cwnd + (uint64_t)hcml->rmx_cwnd) / 2; /* TCPSTAT_INC(tcps_cachedcwnd); */ } if (hcml->rmx_sendpipe != 0) { if (hc_entry->rmx_sendpipe == 0) hc_entry->rmx_sendpipe = hcml->rmx_sendpipe; else hc_entry->rmx_sendpipe = ((uint64_t)hc_entry->rmx_sendpipe + (uint64_t)hcml->rmx_sendpipe) /2; /* TCPSTAT_INC(tcps_cachedsendpipe); */ } if (hcml->rmx_recvpipe != 0) { if (hc_entry->rmx_recvpipe == 0) hc_entry->rmx_recvpipe = hcml->rmx_recvpipe; else hc_entry->rmx_recvpipe = ((uint64_t)hc_entry->rmx_recvpipe + (uint64_t)hcml->rmx_recvpipe) /2; /* TCPSTAT_INC(tcps_cachedrecvpipe); */ } TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * Sysctl function: prints the list and values of all hostcache entries in * unsorted order. 
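 *
 * Note that the RTT, RTTVAR, ssthresh, cwnd and pipe columns are
 * running averages, not last-seen values: tcp_hc_update() above folds
 * each new sample in as
 *
 *	cached = (cached + sample) / 2
 *
 * so the list reflects a smoothed history of past connections.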
*/ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { const int linesize = 128; struct sbuf sb; int i, error; struct hc_metrics *hc_entry; + char ip4buf[INET_ADDRSTRLEN]; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif if (jailed_without_vnet(curthread->td_ucred) != 0) return (EPERM); sbuf_new(&sb, NULL, linesize * (V_tcp_hostcache.cache_count + 1), SBUF_INCLUDENUL); sbuf_printf(&sb, "\nIP address MTU SSTHRESH RTT RTTVAR " " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); #define msec(u) (((u) + 500) / 1000) for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { sbuf_printf(&sb, "%-15s %5u %8u %6lums %6lums %8u %8u %8u %4lu " "%4lu %4i\n", - hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) : + hc_entry->ip4.s_addr ? + inet_ntoa_r(hc_entry->ip4, ip4buf) : #ifdef INET6 ip6_sprintf(ip6buf, &hc_entry->ip6), #else "IPv6?", #endif hc_entry->rmx_mtu, hc_entry->rmx_ssthresh, msec((u_long)hc_entry->rmx_rtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), msec((u_long)hc_entry->rmx_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))), hc_entry->rmx_cwnd, hc_entry->rmx_sendpipe, hc_entry->rmx_recvpipe, hc_entry->rmx_hits, hc_entry->rmx_updates, hc_entry->rmx_expire); } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return(error); } /* * Caller has to make sure the curvnet is set properly. */ static void tcp_hc_purge_internal(int all) { struct hc_metrics *hc_entry, *hc_next; int i; for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH_SAFE(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) { if (all || hc_entry->rmx_expire <= 0) { TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q); uma_zfree(V_tcp_hostcache.zone, hc_entry); V_tcp_hostcache.hashbase[i].hch_length--; V_tcp_hostcache.cache_count--; } else hc_entry->rmx_expire -= V_tcp_hostcache.prune; } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } } /* * Expire and purge (old|all) entries in the tcp_hostcache. Runs * periodically from the callout. */ static void tcp_hc_purge(void *arg) { CURVNET_SET((struct vnet *) arg); int all = 0; if (V_tcp_hostcache.purgeall) { all = 1; V_tcp_hostcache.purgeall = 0; } tcp_hc_purge_internal(all); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, arg); CURVNET_RESTORE(); } /* * Expire and purge all entries in hostcache immediately. */ static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS) { int error, val; val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); tcp_hc_purge_internal(1); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, curvnet); return (0); } Index: head/sys/netpfil/ipfw/ip_fw_log.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_log.c (revision 313820) +++ head/sys/netpfil/ipfw/ip_fw_log.c (revision 313821) @@ -1,411 +1,412 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Logging support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include /* ip6_sprintf() */ #endif #include #ifdef MAC #include #endif /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) #ifdef __APPLE__ #undef snprintf #define snprintf sprintf #define SNPARGS(buf, len) buf + len #define SNP(buf) buf #else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) #endif /* !__APPLE__ */ #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[92], proto[128], fragment[32]; if (V_fw_verbose == 0) { if (args->eh) /* layer2, use orig hdr */ ipfw_bpf_mtap2(args->eh, ETHER_HDR_LEN, m); else { /* Add fake header. Later we will store * more info in the header. */ if (ip->ip_v == 4) ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); else if (ip->ip_v == 6) ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m); else /* Obviously bogus EtherType. 
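 * The 14-byte literals handed to ipfw_bpf_mtap2() are fake Ethernet
 * headers: six 'D' bytes stand in for the destination MAC, six 'S'
 * bytes for the source MAC, and the last two bytes are the EtherType
 * (\x08\x00 for IPv4 and \x86\xdd for IPv6 above, \xff\xff here), so
 * bpf(4) listeners can still parse the captured frame.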
*/ ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m); } return; } /* the old 'log' function */ fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; V_norule_counter++; if (V_norule_counter == V_verbose_limit) limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB || cmd->opcode == O_TAG || cmd->opcode == O_SETDSCP) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", TARG(cmd->arg1, divert)); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", TARG(cmd->arg1, divert)); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", TARG(cmd->arg1, fib) & 0x7FFF); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", TARG(cmd->arg1, skipto)); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", TARG(cmd->arg1, pipe)); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", TARG(cmd->arg1, pipe)); break; case O_FORWARD_IP: { + char buf[INET_ADDRSTRLEN]; ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", - inet_ntoa(dummyaddr)); + inet_ntoa_r(dummyaddr, buf)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; #ifdef INET6 case O_FORWARD_IP6: { char buf[INET6_ADDRSTRLEN]; ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", ip6_sprintf(buf, &sa->sa.sin6_addr)); if (sa->sa.sin6_port) snprintf(SNPARGS(action2, len), ":%u", sa->sa.sin6_port); } break; #endif case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; case O_REASS: action = "Reass"; break; case O_CALLRETURN: if (cmd->len & F_NOT) action = "Return"; else snprintf(SNPARGS(action2, 0), "Call %d", cmd->arg1); break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; u_short ip6f_mf; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 ip6f_mf = offset & IP6F_MORE_FRAG; offset &= IP6F_OFF_MASK; if 
(IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset || ip6f_mf) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, ntohs(ip6->ip6_plen) - hlen, ntohs(offset) << 3, ip6f_mf ? "+" : ""); } else #endif { int ipoff, iplen; ipoff = ntohs(ip->ip_off); iplen = ntohs(ip->ip_len); if (ipoff & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), offset << 3, (ipoff & IP_MF) ? "+" : ""); } } #ifdef __FreeBSD__ if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else #endif log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* end of file */
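The one pattern this revision applies across all three files is worth spelling out: inet_ntoa() formats into a single static buffer, so two calls in one printf(), or two threads logging at once, overwrite each other's result, while inet_ntoa_r() takes a caller-supplied buffer. A minimal userland sketch of the same idea (not part of the change; it uses the portable inet_ntop(), since the two-argument inet_ntoa_r() seen above is the kernel variant):

#include <stdio.h>

#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
	struct in_addr src = { .s_addr = htonl(0xc0000201) };	/* 192.0.2.1 */
	struct in_addr dst = { .s_addr = htonl(0xc0000202) };	/* 192.0.2.2 */
	char sbuf[INET_ADDRSTRLEN], dbuf[INET_ADDRSTRLEN];

	/*
	 * With inet_ntoa() both conversions would share one static
	 * buffer and this line would print the same address twice.
	 * Caller-supplied buffers keep the two results independent.
	 */
	printf("%s -> %s\n",
	    inet_ntop(AF_INET, &src, sbuf, sizeof(sbuf)),
	    inet_ntop(AF_INET, &dst, dbuf, sizeof(dbuf)));
	return (0);
}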