Index: head/sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_tom.c (revision 286000) +++ head/sys/dev/cxgbe/tom/t4_tom.c (revision 286001) @@ -1,1183 +1,1186 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" static struct protosw ddp_protosw; static struct pr_usrreqs ddp_usrreqs; static struct protosw ddp6_protosw; static struct pr_usrreqs ddp6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void queue_tid_release(struct adapter *, int); static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static int add_lip(struct adapter *, struct in6_addr *); static int delete_lip(struct adapter *, struct in6_addr *); static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *); static void init_clip_table(struct adapter *, struct tom_data *); static void update_clip(struct adapter *, void *); static void t4_clip_task(void *, int); static void update_clip_table(struct adapter *, struct tom_data *); static void destroy_clip_table(struct adapter *, struct tom_data *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); static int in6_ifaddr_gen; static eventhandler_tag ifaddr_evhandler; static struct timeout_task 
clip_task; struct toepcb * alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags) { struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16); if (txqid < 0) txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq; KASSERT(txqid >= pi->first_ofld_txq && txqid < pi->first_ofld_txq + pi->nofldtxq, ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi, pi->first_ofld_txq, pi->nofldtxq)); if (rxqid < 0) rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq; KASSERT(rxqid >= pi->first_ofld_rxq && rxqid < pi->first_ofld_rxq + pi->nofldrxq, ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi, pi->first_ofld_rxq, pi->nofldrxq)); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); toep->td = sc->tom_softc; toep->port = pi; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; toep->ofld_txq = &sc->sge.ofld_txq[txqid]; toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; return (toep); } void free_toepcb(struct toepcb *toep) { KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (toep->ulp_mode == ULP_MODE_TCPDDP) { if (inp->inp_vflag & INP_IPV6) so->so_proto = &ddp6_protosw; else so->so_proto = &ddp_protosw; } SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. 
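 * It looks to be meant for the failure path of an offloaded active open,
 * before any CPLs have been exchanged with the card: everything that
 * offload_socket() installed is unwound in place and the extra hold on the
 * inpcb is dropped.  The normal teardown path is t4_pcb_detach() /
 * final_cpl_received() further down.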
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); if (toep->ulp_mode == ULP_MODE_TCPDDP) release_ddp_resources(toep); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) release_lip(td, toep->ce); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. */ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1)); break; default: break; } } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. 
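 * Note that the toepcb itself is freed only once both parties are done with
 * it: t4_pcb_detach() clears TPF_ATTACHED, this function clears
 * TPF_CPL_PENDING, and whichever of the two runs last calls
 * release_offload_resources().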
*/ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, 1); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = ctx; } void remove_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; t->tid_tab[tid] = NULL; atomic_subtract_int(&t->tids_in_use, 1); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } /* * What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, n; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; if (inc->inc_flags & INC_ISIPV6) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) continue; return (i); } /* * Determine the receive window size for a socket. */ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } extern int always_keepalive; #define VIID_SMACIDX(v) (((unsigned int)(v) & 0x7f) << 1) /* * socket so could be a listening socket too. 
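 * calc_opt0() assembles the big-endian "option 0" word that goes into the
 * connection's TCB.  The socket, L2 entry and port arguments are each
 * optional because not all of them are known at every call site (a listen
 * server, for example, has no L2 entry yet).  A hypothetical call, with
 * purely illustrative argument values:
 *
 *	opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
 *	    ULP_MODE_NONE);
 */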
*/ uint64_t calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e, int mtu_idx, int rscale, int rx_credits, int ulp_mode) { uint64_t opt0; KASSERT(rx_credits <= M_RCV_BUFSIZ, ("%s: rcv_bufsiz too high", __func__)); opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int keepalive = always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0 |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0 |= V_L2T_IDX(e->idx); if (pi != NULL) { opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid)); opt0 |= V_TX_CHAN(pi->tx_chan); } return htobe64(opt0); } uint64_t select_ntuple(struct port_info *pi, struct l2t_entry *e) { struct adapter *sc = pi->adapter; struct tp_params *tp = &sc->params.tp; uint16_t viid = pi->viid; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0) { uint32_t vf = G_FW_VIID_VIN(viid); uint32_t pf = G_FW_VIID_PFN(viid); uint32_t vld = G_FW_VIID_VIVLD(viid); ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) | V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } void set_tcpddp_ulp_mode(struct toepcb *toep) { toep->ulp_mode = ULP_MODE_TCPDDP; toep->ddp_flags = DDP_OK; toep->ddp_score = DDP_LOW_SCORE; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tabs(struct tid_info *t) { size_t size; unsigned int i; size = t->ntids * sizeof(*t->tid_tab) + t->natids * sizeof(*t->atid_tab) + t->nstids * sizeof(*t->stid_tab); t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT); if (t->tid_tab == NULL) return (ENOMEM); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids]; t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tabs(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); } static int add_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); 
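	/*
	 * The 128-bit IPv6 address is passed to the firmware as two 64-bit
	 * halves (ip_hi/ip_lo below).  FW_CLIP_CMD with the ALLOC flag
	 * installs the address in the adapter's CLIP table; delete_lip()
	 * issues the same command with the FREE flag to remove it.
	 */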
c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static int delete_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); /* mtx_assert(&td->clip_table_lock, MA_OWNED); */ memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } static struct clip_entry * search_lip(struct tom_data *td, struct in6_addr *lip) { struct clip_entry *ce; mtx_assert(&td->clip_table_lock, MA_OWNED); TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) return (ce); } return (NULL); } struct clip_entry * hold_lip(struct tom_data *td, struct in6_addr *lip) { struct clip_entry *ce; mtx_lock(&td->clip_table_lock); ce = search_lip(td, lip); if (ce != NULL) ce->refcount++; mtx_unlock(&td->clip_table_lock); return (ce); } void release_lip(struct tom_data *td, struct clip_entry *ce) { mtx_lock(&td->clip_table_lock); KASSERT(search_lip(td, &ce->lip) == ce, ("%s: CLIP entry %p p not in CLIP table.", __func__, ce)); KASSERT(ce->refcount > 0, ("%s: CLIP entry %p has refcount 0", __func__, ce)); --ce->refcount; mtx_unlock(&td->clip_table_lock); } static void init_clip_table(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF); TAILQ_INIT(&td->clip_table); td->clip_gen = -1; update_clip_table(sc, td); } static void update_clip(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc")) return; if (uld_active(sc, ULD_TOM)) update_clip_table(sc, sc->tom_softc); end_synchronized_op(sc, LOCK_HELD); } static void t4_clip_task(void *arg, int count) { t4_iterate(update_clip, NULL); } static void update_clip_table(struct adapter *sc, struct tom_data *td) { + struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; struct in6_addr *lip, tlip; struct clip_head stale; struct clip_entry *ce, *ce_temp; int rc, gen = atomic_load_acq_int(&in6_ifaddr_gen); ASSERT_SYNCHRONIZED_OP(sc); - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); mtx_lock(&td->clip_table_lock); if (gen == td->clip_gen) goto done; TAILQ_INIT(&stale); TAILQ_CONCAT(&stale, &td->clip_table, link); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { lip = &ia->ia_addr.sin6_addr; KASSERT(!IN6_IS_ADDR_MULTICAST(lip), ("%s: mcast address in in6_ifaddr list", __func__)); if (IN6_IS_ADDR_LOOPBACK(lip)) continue; if (IN6_IS_SCOPE_EMBED(lip)) { /* Remove the embedded scope */ tlip = *lip; lip = &tlip; in6_clearscope(lip); } /* * XXX: how to weed out the link local address for the loopback * interface? It's fe80::1 usually (always?). */ /* * If it's in the main list then we already know it's not stale. */ TAILQ_FOREACH(ce, &td->clip_table, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) goto next; } /* * If it's in the stale list we should move it to the main list. 
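 * The address is still configured, so the existing CLIP entry (and the
 * firmware state behind it) is recycled rather than deleted and re-added.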
*/ TAILQ_FOREACH(ce, &stale, link) { if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { TAILQ_REMOVE(&stale, ce, link); TAILQ_INSERT_TAIL(&td->clip_table, ce, link); goto next; } } /* A new IP6 address; add it to the CLIP table */ ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); memcpy(&ce->lip, lip, sizeof(ce->lip)); ce->refcount = 0; rc = add_lip(sc, lip); if (rc == 0) TAILQ_INSERT_TAIL(&td->clip_table, ce, link); else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not add %s (%d)\n", __func__, ip, rc); free(ce, M_CXGBE); } next: continue; } /* * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are * no longer referenced by the driver. */ TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) { if (ce->refcount == 0) { rc = delete_lip(sc, &ce->lip); if (rc == 0) { TAILQ_REMOVE(&stale, ce, link); free(ce, M_CXGBE); } else { char ip[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); log(LOG_ERR, "%s: could not delete %s (%d)\n", __func__, ip, rc); } } } /* The ones that are still referenced need to stay in the CLIP table */ TAILQ_CONCAT(&td->clip_table, &stale, link); td->clip_gen = gen; done: mtx_unlock(&td->clip_table_lock); - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); } static void destroy_clip_table(struct adapter *sc, struct tom_data *td) { struct clip_entry *ce, *ce_temp; if (mtx_initialized(&td->clip_table_lock)) { mtx_lock(&td->clip_table_lock); TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) { KASSERT(ce->refcount == 0, ("%s: CLIP entry %p still in use (%d)", __func__, ce, ce->refcount)); TAILQ_REMOVE(&td->clip_table, ce, link); delete_lip(sc, &ce->lip); free(ce, M_CXGBE); } mtx_unlock(&td->clip_table_lock); mtx_destroy(&td->clip_table_lock); } } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_uninit_l2t_cpl_handlers(sc); t4_uninit_cpl_io_handlers(sc); t4_uninit_ddp(sc, td); destroy_clip_table(sc, td); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid; struct wrqe *wr; struct adapter *sc; mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); sc = td_adapter(td); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. 
*/ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; int i, rc; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; /* DDP page pods and CPL handlers */ t4_init_ddp(sc, td); /* CLIP table for IPv6 offload */ init_clip_table(sc, td); /* CPL handlers */ t4_init_connect_cpl_handlers(sc); t4_init_l2t_cpl_handlers(sc); t4_init_listen_cpl_handlers(sc); t4_init_cpl_io_handlers(sc); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; for_each_port(sc, i) TOEDEV(sc->port[i]->ifp) = &td->tod; sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? */ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. 
*/ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static void t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp) { atomic_add_rel_int(&in6_ifaddr_gen, 1); taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4); } static int t4_tom_mod_load(void) { int rc; struct protosw *tcp_protosw, *tcp6_protosw; tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw)); bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs)); ddp_usrreqs.pru_soreceive = t4_soreceive_ddp; ddp_protosw.pr_usrreqs = &ddp_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &ddp6_protosw, sizeof(ddp6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &ddp6_usrreqs, sizeof(ddp6_usrreqs)); ddp6_usrreqs.pru_soreceive = t4_soreceive_ddp; ddp6_protosw.pr_usrreqs = &ddp6_usrreqs; TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL); ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event, t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); rc = t4_register_uld(&tom_uld_info); if (rc != 0) t4_tom_mod_unload(); return (rc); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); if (ifaddr_evhandler) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler); taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL); } return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t t4_tom_moddata= { "t4_tom", t4_tom_modevent, 0 }; MODULE_VERSION(t4_tom, 1); MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/net/if_spppsubr.c =================================================================== --- head/sys/net/if_spppsubr.c (revision 286000) +++ head/sys/net/if_spppsubr.c (revision 286001) @@ -1,5421 +1,5423 @@ /* * Synchronous PPP/Cisco/Frame Relay link level subroutines. * Keepalive protocol implemented in both Cisco and PPP modes. */ /*- * Copyright (C) 1994-2000 Cronyx Engineering. * Author: Serge Vakulenko, * * Heavily revamped to conform to RFC 1661. * Copyright (C) 1997, 2001 Joerg Wunsch. * * This software is distributed with NO WARRANTIES, not even the implied * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
* * Authors grant any other persons or organisations permission to use * or modify this software as long as this message is kept with the software, * all derivative works or modified versions. * * From: Version 2.4, Thu Apr 30 17:17:21 MSD 1997 * * $FreeBSD$ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include +#include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 #include #endif #include #include #define IOCTL_CMD_T u_long #define MAXALIVECNT 3 /* max. alive packets */ /* * Interface flags that can be set in an ifconfig command. * * Setting link0 will make the link passive, i.e. it will be marked * as being administrative openable, but won't be opened to begin * with. Incoming calls will be answered, or subsequent calls with * -link1 will cause the administrative open of the LCP layer. * * Setting link1 will cause the link to auto-dial only as packets * arrive to be sent. * * Setting IFF_DEBUG will syslog the option negotiation and state * transitions at level kern.debug. Note: all logs consistently look * like * * : * * with being something like "bppp0", and * being one of "lcp", "ipcp", "cisco", "chap", "pap", etc. */ #define IFF_PASSIVE IFF_LINK0 /* wait passively for connection */ #define IFF_AUTO IFF_LINK1 /* auto-dial on output */ #define IFF_CISCO IFF_LINK2 /* auto-dial on output */ #define PPP_ALLSTATIONS 0xff /* All-Stations broadcast address */ #define PPP_UI 0x03 /* Unnumbered Information */ #define PPP_IP 0x0021 /* Internet Protocol */ #define PPP_ISO 0x0023 /* ISO OSI Protocol */ #define PPP_XNS 0x0025 /* Xerox NS Protocol */ #define PPP_IPX 0x002b /* Novell IPX Protocol */ #define PPP_VJ_COMP 0x002d /* VJ compressed TCP/IP */ #define PPP_VJ_UCOMP 0x002f /* VJ uncompressed TCP/IP */ #define PPP_IPV6 0x0057 /* Internet Protocol Version 6 */ #define PPP_LCP 0xc021 /* Link Control Protocol */ #define PPP_PAP 0xc023 /* Password Authentication Protocol */ #define PPP_CHAP 0xc223 /* Challenge-Handshake Auth Protocol */ #define PPP_IPCP 0x8021 /* Internet Protocol Control Protocol */ #define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */ #define CONF_REQ 1 /* PPP configure request */ #define CONF_ACK 2 /* PPP configure acknowledge */ #define CONF_NAK 3 /* PPP configure negative ack */ #define CONF_REJ 4 /* PPP configure reject */ #define TERM_REQ 5 /* PPP terminate request */ #define TERM_ACK 6 /* PPP terminate acknowledge */ #define CODE_REJ 7 /* PPP code reject */ #define PROTO_REJ 8 /* PPP protocol reject */ #define ECHO_REQ 9 /* PPP echo request */ #define ECHO_REPLY 10 /* PPP echo reply */ #define DISC_REQ 11 /* PPP discard request */ #define LCP_OPT_MRU 1 /* maximum receive unit */ #define LCP_OPT_ASYNC_MAP 2 /* async control character map */ #define LCP_OPT_AUTH_PROTO 3 /* authentication protocol */ #define LCP_OPT_QUAL_PROTO 4 /* quality protocol */ #define LCP_OPT_MAGIC 5 /* magic number */ #define LCP_OPT_RESERVED 6 /* reserved */ #define LCP_OPT_PROTO_COMP 7 /* protocol field compression */ #define LCP_OPT_ADDR_COMP 8 /* address/control field compression */ #define IPCP_OPT_ADDRESSES 1 /* both IP addresses; deprecated */ #define IPCP_OPT_COMPRESSION 2 /* IP compression protocol (VJ) */ #define IPCP_OPT_ADDRESS 3 /* local IP address */ #define IPV6CP_OPT_IFID 1 /* interface identifier */ #define IPV6CP_OPT_COMPRESSION 2 /* IPv6 compression protocol */ #define 
IPCP_COMP_VJ 0x2d /* Code for VJ compression */ #define PAP_REQ 1 /* PAP name/password request */ #define PAP_ACK 2 /* PAP acknowledge */ #define PAP_NAK 3 /* PAP fail */ #define CHAP_CHALLENGE 1 /* CHAP challenge request */ #define CHAP_RESPONSE 2 /* CHAP challenge response */ #define CHAP_SUCCESS 3 /* CHAP response ok */ #define CHAP_FAILURE 4 /* CHAP response failed */ #define CHAP_MD5 5 /* hash algorithm - MD5 */ #define CISCO_MULTICAST 0x8f /* Cisco multicast address */ #define CISCO_UNICAST 0x0f /* Cisco unicast address */ #define CISCO_KEEPALIVE 0x8035 /* Cisco keepalive protocol */ #define CISCO_ADDR_REQ 0 /* Cisco address request */ #define CISCO_ADDR_REPLY 1 /* Cisco address reply */ #define CISCO_KEEPALIVE_REQ 2 /* Cisco keepalive request */ /* states are named and numbered according to RFC 1661 */ #define STATE_INITIAL 0 #define STATE_STARTING 1 #define STATE_CLOSED 2 #define STATE_STOPPED 3 #define STATE_CLOSING 4 #define STATE_STOPPING 5 #define STATE_REQ_SENT 6 #define STATE_ACK_RCVD 7 #define STATE_ACK_SENT 8 #define STATE_OPENED 9 static MALLOC_DEFINE(M_SPPP, "sppp", "synchronous PPP interface internals"); struct ppp_header { u_char address; u_char control; u_short protocol; } __packed; #define PPP_HEADER_LEN sizeof (struct ppp_header) struct lcp_header { u_char type; u_char ident; u_short len; } __packed; #define LCP_HEADER_LEN sizeof (struct lcp_header) struct cisco_packet { u_long type; u_long par1; u_long par2; u_short rel; u_short time0; u_short time1; } __packed; #define CISCO_PACKET_LEN sizeof (struct cisco_packet) /* * We follow the spelling and capitalization of RFC 1661 here, to make * it easier comparing with the standard. Please refer to this RFC in * case you can't make sense out of these abbreviation; it will also * explain the semantics related to the various events and actions. */ struct cp { u_short proto; /* PPP control protocol number */ u_char protoidx; /* index into state table in struct sppp */ u_char flags; #define CP_LCP 0x01 /* this is the LCP */ #define CP_AUTH 0x02 /* this is an authentication protocol */ #define CP_NCP 0x04 /* this is a NCP */ #define CP_QUAL 0x08 /* this is a quality reporting protocol */ const char *name; /* name of this control protocol */ /* event handlers */ void (*Up)(struct sppp *sp); void (*Down)(struct sppp *sp); void (*Open)(struct sppp *sp); void (*Close)(struct sppp *sp); void (*TO)(void *sp); int (*RCR)(struct sppp *sp, struct lcp_header *h, int len); void (*RCN_rej)(struct sppp *sp, struct lcp_header *h, int len); void (*RCN_nak)(struct sppp *sp, struct lcp_header *h, int len); /* actions */ void (*tlu)(struct sppp *sp); void (*tld)(struct sppp *sp); void (*tls)(struct sppp *sp); void (*tlf)(struct sppp *sp); void (*scr)(struct sppp *sp); }; #define SPP_FMT "%s: " #define SPP_ARGS(ifp) (ifp)->if_xname #define SPPP_LOCK(sp) mtx_lock (&(sp)->mtx) #define SPPP_UNLOCK(sp) mtx_unlock (&(sp)->mtx) #define SPPP_LOCK_ASSERT(sp) mtx_assert (&(sp)->mtx, MA_OWNED) #define SPPP_LOCK_OWNED(sp) mtx_owned (&(sp)->mtx) #ifdef INET /* * The following disgusting hack gets around the problem that IP TOS * can't be set yet. We want to put "interactive" traffic on a high * priority queue. To decide if traffic is interactive, we check that * a) it is TCP and b) one of its ports is telnet, rlogin or ftp control. * * XXX is this really still necessary? 
- joerg - */ static const u_short interactive_ports[8] = { 0, 513, 0, 0, 0, 21, 0, 23, }; #define INTERACTIVE(p) (interactive_ports[(p) & 7] == (p)) #endif /* almost every function needs these */ #define STDDCL \ struct ifnet *ifp = SP2IFP(sp); \ int debug = ifp->if_flags & IFF_DEBUG static int sppp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2); static void sppp_cisco_input(struct sppp *sp, struct mbuf *m); static void sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m); static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type, u_char ident, u_short len, void *data); /* static void sppp_cp_timeout(void *arg); */ static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate); static void sppp_auth_send(const struct cp *cp, struct sppp *sp, unsigned int type, unsigned int id, ...); static void sppp_up_event(const struct cp *cp, struct sppp *sp); static void sppp_down_event(const struct cp *cp, struct sppp *sp); static void sppp_open_event(const struct cp *cp, struct sppp *sp); static void sppp_close_event(const struct cp *cp, struct sppp *sp); static void sppp_to_event(const struct cp *cp, struct sppp *sp); static void sppp_null(struct sppp *sp); static void sppp_pp_up(struct sppp *sp); static void sppp_pp_down(struct sppp *sp); static void sppp_lcp_init(struct sppp *sp); static void sppp_lcp_up(struct sppp *sp); static void sppp_lcp_down(struct sppp *sp); static void sppp_lcp_open(struct sppp *sp); static void sppp_lcp_close(struct sppp *sp); static void sppp_lcp_TO(void *sp); static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_tlu(struct sppp *sp); static void sppp_lcp_tld(struct sppp *sp); static void sppp_lcp_tls(struct sppp *sp); static void sppp_lcp_tlf(struct sppp *sp); static void sppp_lcp_scr(struct sppp *sp); static void sppp_lcp_check_and_close(struct sppp *sp); static int sppp_ncp_check(struct sppp *sp); static void sppp_ipcp_init(struct sppp *sp); static void sppp_ipcp_up(struct sppp *sp); static void sppp_ipcp_down(struct sppp *sp); static void sppp_ipcp_open(struct sppp *sp); static void sppp_ipcp_close(struct sppp *sp); static void sppp_ipcp_TO(void *sp); static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_tlu(struct sppp *sp); static void sppp_ipcp_tld(struct sppp *sp); static void sppp_ipcp_tls(struct sppp *sp); static void sppp_ipcp_tlf(struct sppp *sp); static void sppp_ipcp_scr(struct sppp *sp); static void sppp_ipv6cp_init(struct sppp *sp); static void sppp_ipv6cp_up(struct sppp *sp); static void sppp_ipv6cp_down(struct sppp *sp); static void sppp_ipv6cp_open(struct sppp *sp); static void sppp_ipv6cp_close(struct sppp *sp); static void sppp_ipv6cp_TO(void *sp); static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_tlu(struct sppp *sp); static void sppp_ipv6cp_tld(struct sppp *sp); static void 
sppp_ipv6cp_tls(struct sppp *sp); static void sppp_ipv6cp_tlf(struct sppp *sp); static void sppp_ipv6cp_scr(struct sppp *sp); static void sppp_pap_input(struct sppp *sp, struct mbuf *m); static void sppp_pap_init(struct sppp *sp); static void sppp_pap_open(struct sppp *sp); static void sppp_pap_close(struct sppp *sp); static void sppp_pap_TO(void *sp); static void sppp_pap_my_TO(void *sp); static void sppp_pap_tlu(struct sppp *sp); static void sppp_pap_tld(struct sppp *sp); static void sppp_pap_scr(struct sppp *sp); static void sppp_chap_input(struct sppp *sp, struct mbuf *m); static void sppp_chap_init(struct sppp *sp); static void sppp_chap_open(struct sppp *sp); static void sppp_chap_close(struct sppp *sp); static void sppp_chap_TO(void *sp); static void sppp_chap_tlu(struct sppp *sp); static void sppp_chap_tld(struct sppp *sp); static void sppp_chap_scr(struct sppp *sp); static const char *sppp_auth_type_name(u_short proto, u_char type); static const char *sppp_cp_type_name(u_char type); #ifdef INET static const char *sppp_dotted_quad(u_long addr); static const char *sppp_ipcp_opt_name(u_char opt); #endif #ifdef INET6 static const char *sppp_ipv6cp_opt_name(u_char opt); #endif static const char *sppp_lcp_opt_name(u_char opt); static const char *sppp_phase_name(enum ppp_phase phase); static const char *sppp_proto_name(u_short proto); static const char *sppp_state_name(int state); static int sppp_params(struct sppp *sp, u_long cmd, void *data); static int sppp_strnlen(u_char *p, int max); static void sppp_keepalive(void *dummy); static void sppp_phase_network(struct sppp *sp); static void sppp_print_bytes(const u_char *p, u_short len); static void sppp_print_string(const char *p, u_short len); static void sppp_qflush(struct ifqueue *ifq); #ifdef INET static void sppp_set_ip_addr(struct sppp *sp, u_long src); #endif #ifdef INET6 static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst, struct in6_addr *srcmask); #ifdef IPV6CP_MYIFID_DYN static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src); static void sppp_gen_ip6_addr(struct sppp *sp, const struct in6_addr *src); #endif static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *src); #endif /* if_start () wrapper */ static void sppp_ifstart (struct ifnet *ifp); /* our control protocol descriptors */ static const struct cp lcp = { PPP_LCP, IDX_LCP, CP_LCP, "lcp", sppp_lcp_up, sppp_lcp_down, sppp_lcp_open, sppp_lcp_close, sppp_lcp_TO, sppp_lcp_RCR, sppp_lcp_RCN_rej, sppp_lcp_RCN_nak, sppp_lcp_tlu, sppp_lcp_tld, sppp_lcp_tls, sppp_lcp_tlf, sppp_lcp_scr }; static const struct cp ipcp = { PPP_IPCP, IDX_IPCP, #ifdef INET /* don't run IPCP if there's no IPv4 support */ CP_NCP, #else 0, #endif "ipcp", sppp_ipcp_up, sppp_ipcp_down, sppp_ipcp_open, sppp_ipcp_close, sppp_ipcp_TO, sppp_ipcp_RCR, sppp_ipcp_RCN_rej, sppp_ipcp_RCN_nak, sppp_ipcp_tlu, sppp_ipcp_tld, sppp_ipcp_tls, sppp_ipcp_tlf, sppp_ipcp_scr }; static const struct cp ipv6cp = { PPP_IPV6CP, IDX_IPV6CP, #ifdef INET6 /*don't run IPv6CP if there's no IPv6 support*/ CP_NCP, #else 0, #endif "ipv6cp", sppp_ipv6cp_up, sppp_ipv6cp_down, sppp_ipv6cp_open, sppp_ipv6cp_close, sppp_ipv6cp_TO, sppp_ipv6cp_RCR, sppp_ipv6cp_RCN_rej, sppp_ipv6cp_RCN_nak, sppp_ipv6cp_tlu, sppp_ipv6cp_tld, sppp_ipv6cp_tls, sppp_ipv6cp_tlf, sppp_ipv6cp_scr }; static const struct cp pap = { PPP_PAP, IDX_PAP, CP_AUTH, "pap", sppp_null, sppp_null, sppp_pap_open, sppp_pap_close, sppp_pap_TO, 0, 0, 0, sppp_pap_tlu, sppp_pap_tld, sppp_null, sppp_null, sppp_pap_scr 
}; static const struct cp chap = { PPP_CHAP, IDX_CHAP, CP_AUTH, "chap", sppp_null, sppp_null, sppp_chap_open, sppp_chap_close, sppp_chap_TO, 0, 0, 0, sppp_chap_tlu, sppp_chap_tld, sppp_null, sppp_null, sppp_chap_scr }; static const struct cp *cps[IDX_COUNT] = { &lcp, /* IDX_LCP */ &ipcp, /* IDX_IPCP */ &ipv6cp, /* IDX_IPV6CP */ &pap, /* IDX_PAP */ &chap, /* IDX_CHAP */ }; static void* sppp_alloc(u_char type, struct ifnet *ifp) { struct sppp *sp; sp = malloc(sizeof(struct sppp), M_SPPP, M_WAITOK | M_ZERO); sp->pp_ifp = ifp; return (sp); } static void sppp_free(void *com, u_char type) { free(com, M_SPPP); } static int sppp_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: /* * XXX: should probably be IFT_SPPP, but it's fairly * harmless to allocate struct sppp's for non-sppp * interfaces. */ if_register_com_alloc(IFT_PPP, sppp_alloc, sppp_free); break; case MOD_UNLOAD: /* if_deregister_com_alloc(IFT_PPP); */ return EACCES; default: return EOPNOTSUPP; } return 0; } static moduledata_t spppmod = { "sppp", sppp_modevent, 0 }; MODULE_VERSION(sppp, 1); DECLARE_MODULE(sppp, spppmod, SI_SUB_DRIVERS, SI_ORDER_ANY); /* * Exported functions, comprising our interface to the lower layer. */ /* * Process the received packet. */ void sppp_input(struct ifnet *ifp, struct mbuf *m) { struct ppp_header *h; int isr = -1; struct sppp *sp = IFP2SP(ifp); int debug, do_account = 0; #ifdef INET int hlen, vjlen; u_char *iphdr; #endif SPPP_LOCK(sp); debug = ifp->if_flags & IFF_DEBUG; if (ifp->if_flags & IFF_UP) /* Count received bytes, add FCS and one flag */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + 3); if (m->m_pkthdr.len <= PPP_HEADER_LEN) { /* Too small packet, drop it. */ if (debug) log(LOG_DEBUG, SPP_FMT "input packet is too small, %d bytes\n", SPP_ARGS(ifp), m->m_pkthdr.len); drop: m_freem (m); SPPP_UNLOCK(sp); drop2: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return; } if (sp->pp_mode == PP_FR) { sppp_fr_input (sp, m); SPPP_UNLOCK(sp); return; } /* Get PPP header. 
*/ h = mtod (m, struct ppp_header*); m_adj (m, PPP_HEADER_LEN); switch (h->address) { case PPP_ALLSTATIONS: if (h->control != PPP_UI) goto invalid; if (sp->pp_mode == IFF_CISCO) { if (debug) log(LOG_DEBUG, SPP_FMT "PPP packet in Cisco mode " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } switch (ntohs (h->protocol)) { default: if (debug) log(LOG_DEBUG, SPP_FMT "rejecting protocol " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); if (sp->state[IDX_LCP] == STATE_OPENED) sppp_cp_send (sp, PPP_LCP, PROTO_REJ, ++sp->pp_seq[IDX_LCP], m->m_pkthdr.len + 2, &h->protocol); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto drop; case PPP_LCP: sppp_cp_input(&lcp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_PAP: if (sp->pp_phase >= PHASE_AUTHENTICATE) sppp_pap_input(sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_CHAP: if (sp->pp_phase >= PHASE_AUTHENTICATE) sppp_chap_input(sp, m); m_freem (m); SPPP_UNLOCK(sp); return; #ifdef INET case PPP_IPCP: if (sp->pp_phase == PHASE_NETWORK) sppp_cp_input(&ipcp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_IP: if (sp->state[IDX_IPCP] == STATE_OPENED) { isr = NETISR_IP; } do_account++; break; case PPP_VJ_COMP: if (sp->state[IDX_IPCP] == STATE_OPENED) { if ((vjlen = sl_uncompress_tcp_core(mtod(m, u_char *), m->m_len, m->m_len, TYPE_COMPRESSED_TCP, sp->pp_comp, &iphdr, &hlen)) <= 0) { if (debug) log(LOG_INFO, SPP_FMT "VJ uncompress failed on compressed packet\n", SPP_ARGS(ifp)); goto drop; } /* * Trim the VJ header off the packet, and prepend * the uncompressed IP header (which will usually * end up in two chained mbufs since there's not * enough leading space in the existing mbuf). */ m_adj(m, vjlen); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { SPPP_UNLOCK(sp); goto drop2; } bcopy(iphdr, mtod(m, u_char *), hlen); isr = NETISR_IP; } do_account++; break; case PPP_VJ_UCOMP: if (sp->state[IDX_IPCP] == STATE_OPENED) { if (sl_uncompress_tcp_core(mtod(m, u_char *), m->m_len, m->m_len, TYPE_UNCOMPRESSED_TCP, sp->pp_comp, &iphdr, &hlen) != 0) { if (debug) log(LOG_INFO, SPP_FMT "VJ uncompress failed on uncompressed packet\n", SPP_ARGS(ifp)); goto drop; } isr = NETISR_IP; } do_account++; break; #endif #ifdef INET6 case PPP_IPV6CP: if (sp->pp_phase == PHASE_NETWORK) sppp_cp_input(&ipv6cp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_IPV6: if (sp->state[IDX_IPV6CP] == STATE_OPENED) isr = NETISR_IPV6; do_account++; break; #endif } break; case CISCO_MULTICAST: case CISCO_UNICAST: /* Don't check the control field here (RFC 1547). */ if (sp->pp_mode != IFF_CISCO) { if (debug) log(LOG_DEBUG, SPP_FMT "Cisco packet in PPP mode " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } switch (ntohs (h->protocol)) { default: if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto invalid; case CISCO_KEEPALIVE: sppp_cisco_input (sp, m); m_freem (m); SPPP_UNLOCK(sp); return; #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; do_account++; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; do_account++; break; #endif } break; default: /* Invalid PPP packet. */ invalid: if (debug) log(LOG_DEBUG, SPP_FMT "invalid input packet " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } if (! (ifp->if_flags & IFF_UP) || isr == -1) goto drop; SPPP_UNLOCK(sp); M_SETFIB(m, ifp->if_fib); /* Check queue. */ if (netisr_queue(isr, m)) { /* (0) on success. 
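 * A non-zero return means the packet could not be queued; netisr_queue()
 * consumes the mbuf even in that case, which is why the drop2 path below
 * only bumps the error counters.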
*/ if (debug) log(LOG_DEBUG, SPP_FMT "protocol queue overflow\n", SPP_ARGS(ifp)); goto drop2; } if (do_account) /* * Do only account for network packets, not for control * packets. This is used by some subsystems to detect * idle lines. */ sp->pp_last_recv = time_uptime; } static void sppp_ifstart_sched(void *dummy) { struct sppp *sp = dummy; sp->if_start(SP2IFP(sp)); } /* if_start () wrapper function. We use it to schedule real if_start () for * execution. We can't call it directly */ static void sppp_ifstart(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); if (SPPP_LOCK_OWNED(sp)) { if (callout_pending(&sp->ifstart_callout)) return; callout_reset(&sp->ifstart_callout, 1, sppp_ifstart_sched, (void *)sp); } else { sp->if_start(ifp); } } /* * Enqueue transmit packet. */ static int sppp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct sppp *sp = IFP2SP(ifp); struct ppp_header *h; struct ifqueue *ifq = NULL; int error, rv = 0; #ifdef INET int ipproto = PPP_IP; #endif int debug = ifp->if_flags & IFF_DEBUG; SPPP_LOCK(sp); if (!(ifp->if_flags & IFF_UP) || (!(ifp->if_flags & IFF_AUTO) && !(ifp->if_drv_flags & IFF_DRV_RUNNING))) { #ifdef INET6 drop: #endif m_freem (m); SPPP_UNLOCK(sp); return (ENETDOWN); } if ((ifp->if_flags & IFF_AUTO) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { #ifdef INET6 /* * XXX * * Hack to prevent the initialization-time generated * IPv6 multicast packet to erroneously cause a * dialout event in case IPv6 has been * administratively disabled on that interface. */ if (dst->sa_family == AF_INET6 && !(sp->confflags & CONF_ENABLE_IPV6)) goto drop; #endif /* * Interface is not yet running, but auto-dial. Need * to start LCP for it. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } #ifdef INET if (dst->sa_family == AF_INET) { /* XXX Check mbuf length here? */ struct ip *ip = mtod (m, struct ip*); struct tcphdr *tcp = (struct tcphdr*) ((long*)ip + ip->ip_hl); /* * When using dynamic local IP address assignment by using * 0.0.0.0 as a local address, the first TCP session will * not connect because the local TCP checksum is computed * using 0.0.0.0 which will later become our real IP address * so the TCP checksum computed at the remote end will * become invalid. So we * - don't let packets with src ip addr 0 thru * - we flag TCP packets with src ip 0 as an error */ if(ip->ip_src.s_addr == INADDR_ANY) /* -hm */ { m_freem(m); SPPP_UNLOCK(sp); if(ip->ip_p == IPPROTO_TCP) return(EADDRNOTAVAIL); else return(0); } /* * Put low delay, telnet, rlogin and ftp control packets * in front of the queue or let ALTQ take care. */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) ; else if (_IF_QFULL(&sp->pp_fastq)) ; else if (ip->ip_tos & IPTOS_LOWDELAY) ifq = &sp->pp_fastq; else if (m->m_len < sizeof *ip + sizeof *tcp) ; else if (ip->ip_p != IPPROTO_TCP) ; else if (INTERACTIVE (ntohs (tcp->th_sport))) ifq = &sp->pp_fastq; else if (INTERACTIVE (ntohs (tcp->th_dport))) ifq = &sp->pp_fastq; /* * Do IP Header compression */ if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR && (sp->ipcp.flags & IPCP_VJ) && ip->ip_p == IPPROTO_TCP) switch (sl_compress_tcp(m, ip, sp->pp_comp, sp->ipcp.compress_cid)) { case TYPE_COMPRESSED_TCP: ipproto = PPP_VJ_COMP; break; case TYPE_UNCOMPRESSED_TCP: ipproto = PPP_VJ_UCOMP; break; case TYPE_IP: ipproto = PPP_IP; break; default: m_freem(m); SPPP_UNLOCK(sp); return (EINVAL); } } #endif #ifdef INET6 if (dst->sa_family == AF_INET6) { /* XXX do something tricky here? 
*/ } #endif if (sp->pp_mode == PP_FR) { /* Add frame relay header. */ m = sppp_fr_header (sp, m, dst->sa_family); if (! m) goto nobufs; goto out; } /* * Prepend general data packet PPP header. For now, IP only. */ M_PREPEND (m, PPP_HEADER_LEN, M_NOWAIT); if (! m) { nobufs: if (debug) log(LOG_DEBUG, SPP_FMT "no memory for transmit header\n", SPP_ARGS(ifp)); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (ENOBUFS); } /* * May want to check size of packet * (albeit due to the implementation it's always enough) */ h = mtod (m, struct ppp_header*); if (sp->pp_mode == IFF_CISCO) { h->address = CISCO_UNICAST; /* unicast address */ h->control = 0; } else { h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ } switch (dst->sa_family) { #ifdef INET case AF_INET: /* Internet Protocol */ if (sp->pp_mode == IFF_CISCO) h->protocol = htons (ETHERTYPE_IP); else { /* * Don't choke with an ENETDOWN early. It's * possible that we just started dialing out, * so don't drop the packet immediately. If * we notice that we run out of buffer space * below, we will however remember that we are * not ready to carry IP packets, and return * ENETDOWN, as opposed to ENOBUFS. */ h->protocol = htons(ipproto); if (sp->state[IDX_IPCP] != STATE_OPENED) rv = ENETDOWN; } break; #endif #ifdef INET6 case AF_INET6: /* Internet Protocol */ if (sp->pp_mode == IFF_CISCO) h->protocol = htons (ETHERTYPE_IPV6); else { /* * Don't choke with an ENETDOWN early. It's * possible that we just started dialing out, * so don't drop the packet immediately. If * we notice that we run out of buffer space * below, we will however remember that we are * not ready to carry IP packets, and return * ENETDOWN, as opposed to ENOBUFS. */ h->protocol = htons(PPP_IPV6); if (sp->state[IDX_IPV6CP] != STATE_OPENED) rv = ENETDOWN; } break; #endif default: m_freem (m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (EAFNOSUPPORT); } /* * Queue message on interface, and start output if interface * not yet active. */ out: if (ifq != NULL) error = !(IF_HANDOFF_ADJ(ifq, m, ifp, 3)); else IFQ_HANDOFF_ADJ(ifp, m, 3, error); if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (rv? rv: ENOBUFS); } SPPP_UNLOCK(sp); /* * Unlike in sppp_input(), we can always bump the timestamp * here since sppp_output() is only called on behalf of * network-layer traffic; control-layer traffic is handled * by sppp_cp_send(). */ sp->pp_last_sent = time_uptime; return (0); } void sppp_attach(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); /* Initialize mtx lock */ mtx_init(&sp->mtx, "sppp", MTX_NETWORK_LOCK, MTX_DEF | MTX_RECURSE); /* Initialize keepalive handler. 
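 * The keepalive callout is armed for ten seconds (hz * 10 ticks) after
 * attach; sppp_keepalive() takes it from there.
 */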
*/ callout_init(&sp->keepalive_callout, 1); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); ifp->if_mtu = PP_MTU; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_output = sppp_output; #if 0 sp->pp_flags = PP_KEEPALIVE; #endif ifp->if_snd.ifq_maxlen = 32; sp->pp_fastq.ifq_maxlen = 32; sp->pp_cpq.ifq_maxlen = 20; sp->pp_loopcnt = 0; sp->pp_alivecnt = 0; bzero(&sp->pp_seq[0], sizeof(sp->pp_seq)); bzero(&sp->pp_rseq[0], sizeof(sp->pp_rseq)); sp->pp_phase = PHASE_DEAD; sp->pp_up = sppp_pp_up; sp->pp_down = sppp_pp_down; if(!mtx_initialized(&sp->pp_cpq.ifq_mtx)) mtx_init(&sp->pp_cpq.ifq_mtx, "sppp_cpq", NULL, MTX_DEF); if(!mtx_initialized(&sp->pp_fastq.ifq_mtx)) mtx_init(&sp->pp_fastq.ifq_mtx, "sppp_fastq", NULL, MTX_DEF); sp->pp_last_recv = sp->pp_last_sent = time_uptime; sp->confflags = 0; #ifdef INET sp->confflags |= CONF_ENABLE_VJ; #endif #ifdef INET6 sp->confflags |= CONF_ENABLE_IPV6; #endif callout_init(&sp->ifstart_callout, 1); sp->if_start = ifp->if_start; ifp->if_start = sppp_ifstart; sp->pp_comp = malloc(sizeof(struct slcompress), M_TEMP, M_WAITOK); sl_compress_init(sp->pp_comp, -1); sppp_lcp_init(sp); sppp_ipcp_init(sp); sppp_ipv6cp_init(sp); sppp_pap_init(sp); sppp_chap_init(sp); } void sppp_detach(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); int i; KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); /* Stop keepalive handler. */ if (!callout_drain(&sp->keepalive_callout)) callout_stop(&sp->keepalive_callout); for (i = 0; i < IDX_COUNT; i++) { if (!callout_drain(&sp->ch[i])) callout_stop(&sp->ch[i]); } if (!callout_drain(&sp->pap_my_to_ch)) callout_stop(&sp->pap_my_to_ch); mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); mtx_destroy(&sp->mtx); } /* * Flush the interface output queue. */ static void sppp_flush_unlocked(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); sppp_qflush ((struct ifqueue *)&SP2IFP(sp)->if_snd); sppp_qflush (&sp->pp_fastq); sppp_qflush (&sp->pp_cpq); } void sppp_flush(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); SPPP_LOCK(sp); sppp_flush_unlocked (ifp); SPPP_UNLOCK(sp); } /* * Check if the output queue is empty. */ int sppp_isempty(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); int empty; SPPP_LOCK(sp); empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head && !SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); return (empty); } /* * Get next packet to send. */ struct mbuf * sppp_dequeue(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; SPPP_LOCK(sp); /* * Process only the control protocol queue until we have at * least one NCP open. * * Do always serve all three queues in Cisco mode. */ IF_DEQUEUE(&sp->pp_cpq, m); if (m == NULL && (sppp_ncp_check(sp) || sp->pp_mode == IFF_CISCO || sp->pp_mode == PP_FR)) { IF_DEQUEUE(&sp->pp_fastq, m); if (m == NULL) IF_DEQUEUE (&SP2IFP(sp)->if_snd, m); } SPPP_UNLOCK(sp); return m; } /* * Pick the next packet, do not remove it from the queue. */ struct mbuf * sppp_pick(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; SPPP_LOCK(sp); m = sp->pp_cpq.ifq_head; if (m == NULL && (sp->pp_phase == PHASE_NETWORK || sp->pp_mode == IFF_CISCO || sp->pp_mode == PP_FR)) if ((m = sp->pp_fastq.ifq_head) == NULL) m = SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); return (m); } /* * Process an ioctl request. Called on low priority level. 
*/ int sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data) { struct ifreq *ifr = (struct ifreq*) data; struct sppp *sp = IFP2SP(ifp); int rv, going_up, going_down, newmode; SPPP_LOCK(sp); rv = 0; switch (cmd) { case SIOCAIFADDR: break; case SIOCSIFADDR: /* set the interface "up" when assigning an IP address */ ifp->if_flags |= IFF_UP; /* FALLTHROUGH */ case SIOCSIFFLAGS: going_up = ifp->if_flags & IFF_UP && (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0; going_down = (ifp->if_flags & IFF_UP) == 0 && ifp->if_drv_flags & IFF_DRV_RUNNING; newmode = ifp->if_flags & IFF_PASSIVE; if (!newmode) newmode = ifp->if_flags & IFF_AUTO; if (!newmode) newmode = ifp->if_flags & IFF_CISCO; ifp->if_flags &= ~(IFF_PASSIVE | IFF_AUTO | IFF_CISCO); ifp->if_flags |= newmode; if (!newmode) newmode = sp->pp_flags & PP_FR; if (newmode != sp->pp_mode) { going_down = 1; if (!going_up) going_up = ifp->if_drv_flags & IFF_DRV_RUNNING; } if (going_down) { if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR) lcp.Close(sp); else if (sp->pp_tlf) (sp->pp_tlf)(sp); sppp_flush_unlocked(ifp); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sp->pp_mode = newmode; } if (going_up) { if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR) lcp.Close(sp); sp->pp_mode = newmode; if (sp->pp_mode == 0) { ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } if ((sp->pp_mode == IFF_CISCO) || (sp->pp_mode == PP_FR)) { if (sp->pp_tls) (sp->pp_tls)(sp); ifp->if_drv_flags |= IFF_DRV_RUNNING; } } break; #ifdef SIOCSIFMTU #ifndef ifr_mtu #define ifr_mtu ifr_metric #endif case SIOCSIFMTU: if (ifr->ifr_mtu < 128 || ifr->ifr_mtu > sp->lcp.their_mru) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; break; #endif #ifdef SLIOCSETMTU case SLIOCSETMTU: if (*(short*)data < 128 || *(short*)data > sp->lcp.their_mru) return (EINVAL); ifp->if_mtu = *(short*)data; break; #endif #ifdef SIOCGIFMTU case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; #endif #ifdef SLIOCGETMTU case SLIOCGETMTU: *(short*)data = ifp->if_mtu; break; #endif case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFGENERIC: case SIOCSIFGENERIC: rv = sppp_params(sp, cmd, data); break; default: rv = ENOTTY; } SPPP_UNLOCK(sp); return rv; } /* * Cisco framing implementation. */ /* * Handle incoming Cisco keepalive protocol packets. */ static void sppp_cisco_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct cisco_packet *h; u_long me, mymask; if (m->m_pkthdr.len < CISCO_PACKET_LEN) { if (debug) log(LOG_DEBUG, SPP_FMT "cisco invalid packet length: %d bytes\n", SPP_ARGS(ifp), m->m_pkthdr.len); return; } h = mtod (m, struct cisco_packet*); if (debug) log(LOG_DEBUG, SPP_FMT "cisco input: %d bytes " "<0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", SPP_ARGS(ifp), m->m_pkthdr.len, (u_long)ntohl (h->type), (u_long)h->par1, (u_long)h->par2, (u_int)h->rel, (u_int)h->time0, (u_int)h->time1); switch (ntohl (h->type)) { default: if (debug) log(-1, SPP_FMT "cisco unknown packet type: 0x%lx\n", SPP_ARGS(ifp), (u_long)ntohl (h->type)); break; case CISCO_ADDR_REPLY: /* Reply on address request, ignore */ break; case CISCO_KEEPALIVE_REQ: sp->pp_alivecnt = 0; sp->pp_rseq[IDX_LCP] = ntohl (h->par1); if (sp->pp_seq[IDX_LCP] == sp->pp_rseq[IDX_LCP]) { /* Local and remote sequence numbers are equal. * Probably, the line is in loopback mode. 
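 *
 * After MAXALIVECNT consecutive matches the interface is taken
 * down, the control protocol queue is flushed and a new random
 * local sequence number is generated.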
*/ if (sp->pp_loopcnt >= MAXALIVECNT) { printf (SPP_FMT "loopback\n", SPP_ARGS(ifp)); sp->pp_loopcnt = 0; if (ifp->if_flags & IFF_UP) { if_down (ifp); sppp_qflush (&sp->pp_cpq); } } ++sp->pp_loopcnt; /* Generate new local sequence number */ sp->pp_seq[IDX_LCP] = random(); break; } sp->pp_loopcnt = 0; if (! (ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { if_up(ifp); printf (SPP_FMT "up\n", SPP_ARGS(ifp)); } break; case CISCO_ADDR_REQ: sppp_get_ip_addrs(sp, &me, 0, &mymask); if (me != 0L) sppp_cisco_send(sp, CISCO_ADDR_REPLY, me, mymask); break; } } /* * Send Cisco keepalive packet. */ static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2) { STDDCL; struct ppp_header *h; struct cisco_packet *ch; struct mbuf *m; struct timeval tv; getmicrouptime(&tv); MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + CISCO_PACKET_LEN; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = CISCO_MULTICAST; h->control = 0; h->protocol = htons (CISCO_KEEPALIVE); ch = (struct cisco_packet*) (h + 1); ch->type = htonl (type); ch->par1 = htonl (par1); ch->par2 = htonl (par2); ch->rel = -1; ch->time0 = htons ((u_short) (tv.tv_sec >> 16)); ch->time1 = htons ((u_short) tv.tv_sec); if (debug) log(LOG_DEBUG, SPP_FMT "cisco output: <0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", SPP_ARGS(ifp), (u_long)ntohl (ch->type), (u_long)ch->par1, (u_long)ch->par2, (u_int)ch->rel, (u_int)ch->time0, (u_int)ch->time1); if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * PPP protocol implementation. */ /* * Send PPP control protocol packet. */ static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type, u_char ident, u_short len, void *data) { STDDCL; struct ppp_header *h; struct lcp_header *lh; struct mbuf *m; if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) len = MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN; MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ h->protocol = htons (proto); /* Link Control Protocol */ lh = (struct lcp_header*) (h + 1); lh->type = type; lh->ident = ident; lh->len = htons (LCP_HEADER_LEN + len); if (len) bcopy (data, lh+1, len); if (debug) { log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_proto_name(proto), sppp_cp_type_name (lh->type), lh->ident, ntohs (lh->len)); sppp_print_bytes ((u_char*) (lh+1), len); log(-1, ">\n"); } if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Handle incoming PPP control protocol packets. 
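 *
 * This is the option-negotiation automaton of RFC 1661, shared by
 * LCP and the NCPs.  It dispatches on the code field
 * (Configure-Request/Ack/Nak/Reject, Terminate-Request/Ack,
 * Code-Reject, Protocol-Reject, Echo-Request/Reply), performs the
 * state transitions and delegates protocol-specific work to the
 * struct cp callbacks (RCR, RCN_nak, RCN_rej, scr, tlu, tld, tlf).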
*/ static void sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len = m->m_pkthdr.len; int rv; u_char *p; if (len < 4) { if (debug) log(LOG_DEBUG, SPP_FMT "%s invalid packet length: %d bytes\n", SPP_ARGS(ifp), cp->name, len); return; } h = mtod (m, struct lcp_header*); if (debug) { log(LOG_DEBUG, SPP_FMT "%s input(%s): <%s id=0x%x len=%d", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx]), sppp_cp_type_name (h->type), h->ident, ntohs (h->len)); sppp_print_bytes ((u_char*) (h+1), len-4); log(-1, ">\n"); } if (len > ntohs (h->len)) len = ntohs (h->len); p = (u_char *)(h + 1); switch (h->type) { case CONF_REQ: if (len < 4) { if (debug) log(-1, SPP_FMT "%s invalid conf-req length %d\n", SPP_ARGS(ifp), cp->name, len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } /* handle states where RCR doesn't get a SCA/SCN */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: case STATE_STOPPING: return; case STATE_CLOSED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); return; } rv = (cp->RCR)(sp, h, len); switch (sp->state[cp->protoidx]) { case STATE_OPENED: (cp->tld)(sp); (cp->scr)(sp); /* FALLTHROUGH */ case STATE_ACK_SENT: case STATE_REQ_SENT: /* * sppp_cp_change_state() have the side effect of * restarting the timeouts. We want to avoid that * if the state don't change, otherwise we won't * ever timeout and resend a configuration request * that got lost. */ if (sp->state[cp->protoidx] == (rv ? STATE_ACK_SENT: STATE_REQ_SENT)) break; sppp_cp_change_state(cp, sp, rv? STATE_ACK_SENT: STATE_REQ_SENT); break; case STATE_STOPPED: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, rv? STATE_ACK_SENT: STATE_REQ_SENT); break; case STATE_ACK_RCVD: if (rv) { sppp_cp_change_state(cp, sp, STATE_OPENED); if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), cp->name); (cp->tlu)(sp); } else sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_ACK: if (h->ident != sp->confid[cp->protoidx]) { if (debug) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_CLOSING: case STATE_STOPPING: break; case STATE_REQ_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_ACK_RCVD: (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; sppp_cp_change_state(cp, sp, STATE_OPENED); if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), cp->name); (cp->tlu)(sp); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_NAK: case CONF_REJ: if (h->ident != sp->confid[cp->protoidx]) { if (debug) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if 
(h->type == CONF_NAK) (cp->RCN_nak)(sp, h, len); else /* CONF_REJ */ (cp->RCN_rej)(sp, h, len); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_REQ_SENT: case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; /* * Slow things down a bit if we think we might be * in loopback. Depend on the timeout to send the * next configuration request. */ if (sp->pp_loopcnt) break; (cp->scr)(sp); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); (cp->scr)(sp); break; case STATE_CLOSING: case STATE_STOPPING: break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case TERM_REQ: switch (sp->state[cp->protoidx]) { case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); /* FALLTHROUGH */ case STATE_CLOSED: case STATE_STOPPED: case STATE_CLOSING: case STATE_STOPPING: case STATE_REQ_SENT: sta: /* Send Terminate-Ack packet. */ if (debug) log(LOG_DEBUG, SPP_FMT "%s send terminate-ack\n", SPP_ARGS(ifp), cp->name); sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_OPENED: (cp->tld)(sp); sp->rst_counter[cp->protoidx] = 0; sppp_cp_change_state(cp, sp, STATE_STOPPING); goto sta; break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case TERM_ACK: switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: break; case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_CLOSED); (cp->tlf)(sp); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_OPENED: (cp->tld)(sp); (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CODE_REJ: /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ log(LOG_INFO, SPP_FMT "%s: ignoring RXJ (%s) for proto 0x%x, " "danger will robinson\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), ntohs(*((u_short *)p))); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: case STATE_CLOSING: case STATE_STOPPING: case STATE_OPENED: break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case PROTO_REJ: { int catastrophic; const struct cp *upper; int i; u_int16_t proto; catastrophic = 0; upper = NULL; proto = ntohs(*((u_int16_t *)p)); for (i = 0; i < IDX_COUNT; i++) { if (cps[i]->proto == proto) { upper = cps[i]; break; } } if (upper == NULL) catastrophic++; if (catastrophic || debug) log(catastrophic? LOG_INFO: LOG_DEBUG, SPP_FMT "%s: RXJ%c (%s) for proto 0x%x (%s/%s)\n", SPP_ARGS(ifp), cp->name, catastrophic ? 
'-' : '+', sppp_cp_type_name(h->type), proto, upper ? upper->name : "unknown", upper ? sppp_state_name(sp->state[upper->protoidx]) : "?"); /* * if we got RXJ+ against conf-req, the peer does not implement * this particular protocol type. terminate the protocol. */ if (upper && !catastrophic) { if (sp->state[upper->protoidx] == STATE_REQ_SENT) { upper->Close(sp); break; } } /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: case STATE_CLOSING: case STATE_STOPPING: case STATE_OPENED: break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; } case DISC_REQ: if (cp->proto != PPP_LCP) goto illegal; /* Discard the packet. */ break; case ECHO_REQ: if (cp->proto != PPP_LCP) goto illegal; if (sp->state[cp->protoidx] != STATE_OPENED) { if (debug) log(-1, SPP_FMT "lcp echo req but lcp closed\n", SPP_ARGS(ifp)); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { if (debug) log(-1, SPP_FMT "invalid lcp echo request " "packet length: %d bytes\n", SPP_ARGS(ifp), len); break; } if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && ntohl (*(long*)(h+1)) == sp->lcp.magic) { /* Line loopback mode detected. */ printf(SPP_FMT "loopback\n", SPP_ARGS(ifp)); sp->pp_loopcnt = MAXALIVECNT * 5; if_down (ifp); sppp_qflush (&sp->pp_cpq); /* Shut down the PPP link. */ /* XXX */ lcp.Down(sp); lcp.Up(sp); break; } *(long*)(h+1) = htonl (sp->lcp.magic); if (debug) log(-1, SPP_FMT "got lcp echo req, sending echo rep\n", SPP_ARGS(ifp)); sppp_cp_send (sp, PPP_LCP, ECHO_REPLY, h->ident, len-4, h+1); break; case ECHO_REPLY: if (cp->proto != PPP_LCP) goto illegal; if (h->ident != sp->lcp.echoid) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { if (debug) log(-1, SPP_FMT "lcp invalid echo reply " "packet length: %d bytes\n", SPP_ARGS(ifp), len); break; } if (debug) log(-1, SPP_FMT "lcp got echo rep\n", SPP_ARGS(ifp)); if (!(sp->lcp.opts & (1 << LCP_OPT_MAGIC)) || ntohl (*(long*)(h+1)) != sp->lcp.magic) sp->pp_alivecnt = 0; break; default: /* Unknown packet type -- send Code-Reject packet. */ illegal: if (debug) log(-1, SPP_FMT "%s send code-rej for 0x%x\n", SPP_ARGS(ifp), cp->name, h->type); sppp_cp_send(sp, cp->proto, CODE_REJ, ++sp->pp_seq[cp->protoidx], m->m_pkthdr.len, h); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } } /* * The generic part of all Up/Down/Open/Close/TO event handlers. * Basically, the state transition handling in the automaton. 
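 *
 * These implement the Up, Down, Open, Close and TO+/TO- events of
 * the RFC 1661 state table; sppp_cp_change_state() applies the new
 * state and starts or stops the restart timer as required.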
*/ static void sppp_up_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s up(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: sppp_cp_change_state(cp, sp, STATE_CLOSED); break; case STATE_STARTING: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal up in state %s\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); } } static void sppp_down_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s down(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_INITIAL); break; case STATE_STOPPED: sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_STARTING); break; case STATE_OPENED: (cp->tld)(sp); sppp_cp_change_state(cp, sp, STATE_STARTING); break; default: printf(SPP_FMT "%s illegal down in state %s\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); } } static void sppp_open_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s open(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STARTING: break; case STATE_CLOSED: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_STOPPED: /* * Try escaping stopped state. This seems to bite * people occasionally, in particular for IPCP, * presumably following previous IPCP negotiation * aborts. Somehow, we must have missed a Down event * which would have caused a transition into starting * state, so as a bandaid we force the Down event now. * This effectively implements (something like the) * `restart' option mentioned in the state transition * table of RFC 1661. 
*/ sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: case STATE_OPENED: break; case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_STOPPING); break; } } static void sppp_close_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s close(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: case STATE_CLOSED: case STATE_CLOSING: break; case STATE_STARTING: sppp_cp_change_state(cp, sp, STATE_INITIAL); (cp->tlf)(sp); break; case STATE_STOPPED: sppp_cp_change_state(cp, sp, STATE_CLOSED); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_CLOSING); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_terminate; sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); sppp_cp_change_state(cp, sp, STATE_CLOSING); break; } } static void sppp_to_event(const struct cp *cp, struct sppp *sp) { STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "%s TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx]), sp->rst_counter[cp->protoidx]); if (--sp->rst_counter[cp->protoidx] < 0) /* TO- event */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_CLOSED); (cp->tlf)(sp); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; } else /* TO+ event */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: case STATE_STOPPING: sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: (cp->scr)(sp); /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_ACK_SENT: (cp->scr)(sp); callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; } SPPP_UNLOCK(sp); } /* * Change the state of a control protocol in the state automaton. * Takes care of starting/stopping the restart timer. */ static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate) { sp->state[cp->protoidx] = newstate; callout_stop (&sp->ch[cp->protoidx]); switch (newstate) { case STATE_INITIAL: case STATE_STARTING: case STATE_CLOSED: case STATE_STOPPED: case STATE_OPENED: break; case STATE_CLOSING: case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; } } /* *--------------------------------------------------------------------------* * * * The LCP implementation. 
* * * *--------------------------------------------------------------------------* */ static void sppp_pp_up(struct sppp *sp) { SPPP_LOCK(sp); lcp.Up(sp); SPPP_UNLOCK(sp); } static void sppp_pp_down(struct sppp *sp) { SPPP_LOCK(sp); lcp.Down(sp); SPPP_UNLOCK(sp); } static void sppp_lcp_init(struct sppp *sp) { sp->lcp.opts = (1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; sp->state[IDX_LCP] = STATE_INITIAL; sp->fail_counter[IDX_LCP] = 0; sp->pp_seq[IDX_LCP] = 0; sp->pp_rseq[IDX_LCP] = 0; sp->lcp.protos = 0; sp->lcp.mru = sp->lcp.their_mru = PP_MTU; /* Note that these values are relevant for all control protocols */ sp->lcp.timeout = 3 * hz; sp->lcp.max_terminate = 2; sp->lcp.max_configure = 10; sp->lcp.max_failure = 10; callout_init(&sp->ch[IDX_LCP], 1); } static void sppp_lcp_up(struct sppp *sp) { STDDCL; sp->pp_alivecnt = 0; sp->lcp.opts = (1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; sp->lcp.protos = 0; sp->lcp.mru = sp->lcp.their_mru = PP_MTU; /* * If we are authenticator, negotiate LCP_AUTH */ if (sp->hisauth.proto != 0) sp->lcp.opts |= (1 << LCP_OPT_AUTH_PROTO); else sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); sp->pp_flags &= ~PP_NEEDAUTH; /* * If this interface is passive or dial-on-demand, and we are * still in Initial state, it means we've got an incoming * call. Activate the interface. */ if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) != 0) { if (debug) log(LOG_DEBUG, SPP_FMT "Up event", SPP_ARGS(ifp)); ifp->if_drv_flags |= IFF_DRV_RUNNING; if (sp->state[IDX_LCP] == STATE_INITIAL) { if (debug) log(-1, "(incoming call)\n"); sp->pp_flags |= PP_CALLIN; lcp.Open(sp); } else if (debug) log(-1, "\n"); } else if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0 && (sp->state[IDX_LCP] == STATE_INITIAL)) { ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } sppp_up_event(&lcp, sp); } static void sppp_lcp_down(struct sppp *sp) { STDDCL; sppp_down_event(&lcp, sp); /* * If this is neither a dial-on-demand nor a passive * interface, simulate an ``ifconfig down'' action, so the * administrator can force a redial by another ``ifconfig * up''. XXX For leased line operation, should we immediately * try to reopen the connection here? */ if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0) { log(LOG_INFO, SPP_FMT "Down event, taking interface down.\n", SPP_ARGS(ifp)); if_down(ifp); } else { if (debug) log(LOG_DEBUG, SPP_FMT "Down event (carrier loss)\n", SPP_ARGS(ifp)); sp->pp_flags &= ~PP_CALLIN; if (sp->state[IDX_LCP] != STATE_INITIAL) lcp.Close(sp); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } } static void sppp_lcp_open(struct sppp *sp) { sppp_open_event(&lcp, sp); } static void sppp_lcp_close(struct sppp *sp) { sppp_close_event(&lcp, sp); } static void sppp_lcp_TO(void *cookie) { sppp_to_event(&lcp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) */ static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *r, *p; int origlen, rlen; u_long nmagic; u_short authproto; len -= 4; origlen = len; buf = r = malloc (len, M_TEMP, M_NOWAIT); if (! 
buf) return (0); if (debug) log(LOG_DEBUG, SPP_FMT "lcp parse opts: ", SPP_ARGS(ifp)); /* pass 1: check for things that need to be rejected */ p = (void*) (h+1); for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number. */ if (len >= 6 && p[1] == 6) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_ASYNC_MAP: /* Async control character map. */ if (len >= 6 && p[1] == 6) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_MRU: /* Maximum receive unit. */ if (len >= 4 && p[1] == 4) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_AUTH_PROTO: if (len < 4) { if (debug) log(-1, "[invalid] "); break; } authproto = (p[2] << 8) + p[3]; if (authproto == PPP_CHAP && p[1] != 5) { if (debug) log(-1, "[invalid chap len] "); break; } if (sp->myauth.proto == 0) { /* we are not configured to do auth */ if (debug) log(-1, "[not configured] "); break; } /* * Remote want us to authenticate, remember this, * so we stay in PHASE_AUTHENTICATE after LCP got * up. */ sp->pp_flags |= PP_NEEDAUTH; continue; default: /* Others not supported. */ if (debug) log(-1, "[rej] "); break; } /* Add the option to rejected list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); return 0; } else if (debug) log(-1, "\n"); /* * pass 2: check for option values that are unacceptable and * thus require to be nak'ed. */ if (debug) log(LOG_DEBUG, SPP_FMT "lcp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- extract. */ nmagic = (u_long)p[2] << 24 | (u_long)p[3] << 16 | p[4] << 8 | p[5]; if (nmagic != sp->lcp.magic) { sp->pp_loopcnt = 0; if (debug) log(-1, "0x%lx ", nmagic); continue; } if (debug && sp->pp_loopcnt < MAXALIVECNT*5) log(-1, "[glitch] "); ++sp->pp_loopcnt; /* * We negate our magic here, and NAK it. If * we see it later in an NAK packet, we * suggest a new one. */ nmagic = ~sp->lcp.magic; /* Gonna NAK it. */ p[2] = nmagic >> 24; p[3] = nmagic >> 16; p[4] = nmagic >> 8; p[5] = nmagic; break; case LCP_OPT_ASYNC_MAP: /* * Async control character map -- just ignore it. * * Quote from RFC 1662, chapter 6: * To enable this functionality, synchronous PPP * implementations MUST always respond to the * Async-Control-Character-Map Configuration * Option with the LCP Configure-Ack. However, * acceptance of the Configuration Option does * not imply that the synchronous implementation * will do any ACCM mapping. Instead, all such * octet mapping will be performed by the * asynchronous-to-synchronous converter. */ continue; case LCP_OPT_MRU: /* * Maximum receive unit. Always agreeable, * but ignored by now. */ sp->lcp.their_mru = p[2] * 256 + p[3]; if (debug) log(-1, "%lu ", sp->lcp.their_mru); continue; case LCP_OPT_AUTH_PROTO: authproto = (p[2] << 8) + p[3]; if (sp->myauth.proto != authproto) { /* not agreed, nak */ if (debug) log(-1, "[mine %s != his %s] ", sppp_proto_name(sp->hisauth.proto), sppp_proto_name(authproto)); p[2] = sp->myauth.proto >> 8; p[3] = sp->myauth.proto; break; } if (authproto == PPP_CHAP && p[4] != CHAP_MD5) { if (debug) log(-1, "[chap not MD5] "); p[4] = CHAP_MD5; break; } continue; } /* Add the option to nak'ed list. 
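 *
 * Once all options have been examined, the collected list is either
 * conf-nak'ed back, escalated to a conf-rej after max_failure
 * unsuccessful attempts, or, when the magic number keeps colliding
 * (pp_loopcnt), treated as a looped-back line: the interface is
 * taken down and LCP is bounced with Down/Up.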
*/ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { /* * Local and remote magics equal -- loopback? */ if (sp->pp_loopcnt >= MAXALIVECNT*5) { if (sp->pp_loopcnt == MAXALIVECNT*5) printf (SPP_FMT "loopback\n", SPP_ARGS(ifp)); if (ifp->if_flags & IFF_UP) { if_down(ifp); sppp_qflush(&sp->pp_cpq); /* XXX ? */ lcp.Down(sp); lcp.Up(sp); } } else if (!sp->pp_loopcnt && ++sp->fail_counter[IDX_LCP] >= sp->lcp.max_failure) { if (debug) log(-1, " max_failure (%d) exceeded, " "send conf-rej\n", sp->lcp.max_failure); sppp_cp_send(sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); } else { if (debug) log(-1, " send conf-nak\n"); sppp_cp_send (sp, PPP_LCP, CONF_NAK, h->ident, rlen, buf); } } else { if (debug) log(-1, " send conf-ack\n"); sp->fail_counter[IDX_LCP] = 0; sp->pp_loopcnt = 0; sppp_cp_send (sp, PPP_LCP, CONF_ACK, h->ident, origlen, h+1); } free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the LCP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *p; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "lcp rej opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- can't use it, use 0 */ sp->lcp.opts &= ~(1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; break; case LCP_OPT_MRU: /* * Should not be rejected anyway, since we only * negotiate a MRU if explicitly requested by * peer. */ sp->lcp.opts &= ~(1 << LCP_OPT_MRU); break; case LCP_OPT_AUTH_PROTO: /* * Peer doesn't want to authenticate himself, * deny unless this is a dialout call, and * AUTHFLAG_NOCALLOUT is set. */ if ((sp->pp_flags & PP_CALLIN) == 0 && (sp->hisauth.flags & AUTHFLAG_NOCALLOUT) != 0) { if (debug) log(-1, "[don't insist on auth " "for callout]"); sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); break; } if (debug) log(-1, "[access denied]\n"); lcp.Close(sp); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the LCP Configure-NAK option list, and adjust our * negotiation. */ static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *p; u_long magic; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "lcp nak opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- renegotiate */ if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && len >= 6 && p[1] == 6) { magic = (u_long)p[2] << 24 | (u_long)p[3] << 16 | p[4] << 8 | p[5]; /* * If the remote magic is our negated one, * this looks like a loopback problem. * Suggest a new magic to make sure. */ if (magic == ~sp->lcp.magic) { if (debug) log(-1, "magic glitch "); sp->lcp.magic = random(); } else { sp->lcp.magic = magic; if (debug) log(-1, "%lu ", magic); } } break; case LCP_OPT_MRU: /* * Peer wants to advise us to negotiate an MRU. * Agree on it if it's reasonable, or use * default otherwise. */ if (len >= 4 && p[1] == 4) { u_int mru = p[2] * 256 + p[3]; if (debug) log(-1, "%d ", mru); if (mru < PP_MTU || mru > PP_MAX_MRU) mru = PP_MTU; sp->lcp.mru = mru; sp->lcp.opts |= (1 << LCP_OPT_MRU); } break; case LCP_OPT_AUTH_PROTO: /* * Peer doesn't like our authentication method, * deny. 
*/ if (debug) log(-1, "[access denied]\n"); lcp.Close(sp); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_lcp_tlu(struct sppp *sp) { STDDCL; int i; u_long mask; /* XXX ? */ if (! (ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* Coming out of loopback mode. */ if_up(ifp); printf (SPP_FMT "up\n", SPP_ARGS(ifp)); } for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_QUAL) (cps[i])->Open(sp); if ((sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0 || (sp->pp_flags & PP_NEEDAUTH) != 0) sp->pp_phase = PHASE_AUTHENTICATE; else sp->pp_phase = PHASE_NETWORK; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* * Open all authentication protocols. This is even required * if we already proceeded to network phase, since it might be * that remote wants us to authenticate, so we might have to * send a PAP request. Undesired authentication protocols * don't do anything when they get an Open event. */ for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_AUTH) (cps[i])->Open(sp); if (sp->pp_phase == PHASE_NETWORK) { /* Notify all NCPs. */ for (i = 0; i < IDX_COUNT; i++) if (((cps[i])->flags & CP_NCP) && /* * XXX * Hack to administratively disable IPv6 if * not desired. Perhaps we should have another * flag for this, but right now, we can make * all struct cp's read/only. */ (cps[i] != &ipv6cp || (sp->confflags & CONF_ENABLE_IPV6))) (cps[i])->Open(sp); } /* Send Up events to all started protos. */ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) (cps[i])->Up(sp); /* notify low-level driver of state change */ if (sp->pp_chg) sp->pp_chg(sp, (int)sp->pp_phase); if (sp->pp_phase == PHASE_NETWORK) /* if no NCP is starting, close down */ sppp_lcp_check_and_close(sp); } static void sppp_lcp_tld(struct sppp *sp) { STDDCL; int i; u_long mask; sp->pp_phase = PHASE_TERMINATE; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* * Take upper layers down. We send the Down event first and * the Close second to prevent the upper layers from sending * ``a flurry of terminate-request packets'', as the RFC * describes it. */ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) { (cps[i])->Down(sp); (cps[i])->Close(sp); } } static void sppp_lcp_tls(struct sppp *sp) { STDDCL; sp->pp_phase = PHASE_ESTABLISH; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify lower layer if desired. */ if (sp->pp_tls) (sp->pp_tls)(sp); else (sp->pp_up)(sp); } static void sppp_lcp_tlf(struct sppp *sp) { STDDCL; sp->pp_phase = PHASE_DEAD; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify lower layer if desired. */ if (sp->pp_tlf) (sp->pp_tlf)(sp); else (sp->pp_down)(sp); } static void sppp_lcp_scr(struct sppp *sp) { char opt[6 /* magicnum */ + 4 /* mru */ + 5 /* chap */]; int i = 0; u_short authproto; if (sp->lcp.opts & (1 << LCP_OPT_MAGIC)) { if (! 
sp->lcp.magic) sp->lcp.magic = random(); opt[i++] = LCP_OPT_MAGIC; opt[i++] = 6; opt[i++] = sp->lcp.magic >> 24; opt[i++] = sp->lcp.magic >> 16; opt[i++] = sp->lcp.magic >> 8; opt[i++] = sp->lcp.magic; } if (sp->lcp.opts & (1 << LCP_OPT_MRU)) { opt[i++] = LCP_OPT_MRU; opt[i++] = 4; opt[i++] = sp->lcp.mru >> 8; opt[i++] = sp->lcp.mru; } if (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) { authproto = sp->hisauth.proto; opt[i++] = LCP_OPT_AUTH_PROTO; opt[i++] = authproto == PPP_CHAP? 5: 4; opt[i++] = authproto >> 8; opt[i++] = authproto; if (authproto == PPP_CHAP) opt[i++] = CHAP_MD5; } sp->confid[IDX_LCP] = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, CONF_REQ, sp->confid[IDX_LCP], i, &opt); } /* * Check the open NCPs, return true if at least one NCP is open. */ static int sppp_ncp_check(struct sppp *sp) { int i, mask; for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && (cps[i])->flags & CP_NCP) return 1; return 0; } /* * Re-check the open NCPs and see if we should terminate the link. * Called by the NCPs during their tlf action handling. */ static void sppp_lcp_check_and_close(struct sppp *sp) { if (sp->pp_phase < PHASE_NETWORK) /* don't bother, we are already going down */ return; if (sppp_ncp_check(sp)) return; lcp.Close(sp); } /* *--------------------------------------------------------------------------* * * * The IPCP implementation. * * * *--------------------------------------------------------------------------* */ #ifdef INET static void sppp_ipcp_init(struct sppp *sp) { sp->ipcp.opts = 0; sp->ipcp.flags = 0; sp->state[IDX_IPCP] = STATE_INITIAL; sp->fail_counter[IDX_IPCP] = 0; sp->pp_seq[IDX_IPCP] = 0; sp->pp_rseq[IDX_IPCP] = 0; callout_init(&sp->ch[IDX_IPCP], 1); } static void sppp_ipcp_up(struct sppp *sp) { sppp_up_event(&ipcp, sp); } static void sppp_ipcp_down(struct sppp *sp) { sppp_down_event(&ipcp, sp); } static void sppp_ipcp_open(struct sppp *sp) { STDDCL; u_long myaddr, hisaddr; sp->ipcp.flags &= ~(IPCP_HISADDR_SEEN | IPCP_MYADDR_SEEN | IPCP_MYADDR_DYN | IPCP_VJ); sp->ipcp.opts = 0; sppp_get_ip_addrs(sp, &myaddr, &hisaddr, 0); /* * If we don't have his address, this probably means our * interface doesn't want to talk IP at all. (This could * be the case if somebody wants to speak only IPX, for * example.) Don't open IPCP in this case. */ if (hisaddr == 0L) { /* XXX this message should go away */ if (debug) log(LOG_DEBUG, SPP_FMT "ipcp_open(): no IP interface\n", SPP_ARGS(ifp)); return; } if (myaddr == 0L) { /* * I don't have an assigned address, so i need to * negotiate my address. */ sp->ipcp.flags |= IPCP_MYADDR_DYN; sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); } else sp->ipcp.flags |= IPCP_MYADDR_SEEN; if (sp->confflags & CONF_ENABLE_VJ) { sp->ipcp.opts |= (1 << IPCP_OPT_COMPRESSION); sp->ipcp.max_state = MAX_STATES - 1; sp->ipcp.compress_cid = 1; } sppp_open_event(&ipcp, sp); } static void sppp_ipcp_close(struct sppp *sp) { sppp_close_event(&ipcp, sp); if (sp->ipcp.flags & IPCP_MYADDR_DYN) /* * My address was dynamic, clear it again. */ sppp_set_ip_addr(sp, 0L); } static void sppp_ipcp_TO(void *cookie) { sppp_to_event(&ipcp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) 
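 *
 * Pass 1 conf-rejects options we do not recognize (anything other
 * than VJ compression and the address option); pass 2 checks the
 * values: acceptable VJ parameters are recorded, and the peer's
 * address is conf-ack'ed when it matches our idea (or when we do
 * not care), otherwise conf-nak'ed with our value.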
*/ static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *r, *p; struct ifnet *ifp = SP2IFP(sp); int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; u_long hisaddr, desiredaddr; int gotmyaddr = 0; int desiredcomp; len -= 4; origlen = len; /* * Make sure to allocate a buf that can at least hold a * conf-nak with an `address' option. We might need it below. */ buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); if (! buf) return (0); /* pass 1: see if we can recognize them */ if (debug) log(LOG_DEBUG, SPP_FMT "ipcp parse opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: if (!(sp->confflags & CONF_ENABLE_VJ)) { /* VJ compression administratively disabled */ if (debug) log(-1, "[locally disabled] "); break; } /* * In theory, we should only conf-rej an * option that is shorter than RFC 1618 * requires (i.e. < 4), and should conf-nak * anything else that is not VJ. However, * since our algorithm always uses the * original option to NAK it with new values, * things would become more complicated. In * pratice, the only commonly implemented IP * compression option is VJ anyway, so the * difference is negligible. */ if (len >= 6 && p[1] == 6) { /* * correctly formed compression option * that could be VJ compression */ continue; } if (debug) log(-1, "optlen %d [invalid/unsupported] ", p[1]); break; case IPCP_OPT_ADDRESS: if (len >= 6 && p[1] == 6) { /* correctly formed address option */ continue; } if (debug) log(-1, "[invalid] "); break; default: /* Others not supported. */ if (debug) log(-1, "[rej] "); break; } /* Add the option to rejected list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_IPCP, CONF_REJ, h->ident, rlen, buf); return 0; } else if (debug) log(-1, "\n"); /* pass 2: parse option values */ sppp_get_ip_addrs(sp, 0, &hisaddr, 0); if (debug) log(LOG_DEBUG, SPP_FMT "ipcp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: desiredcomp = p[2] << 8 | p[3]; /* We only support VJ */ if (desiredcomp == IPCP_COMP_VJ) { if (debug) log(-1, "VJ [ack] "); sp->ipcp.flags |= IPCP_VJ; sl_compress_init(sp->pp_comp, p[4]); sp->ipcp.max_state = p[4]; sp->ipcp.compress_cid = p[5]; continue; } if (debug) log(-1, "compproto %#04x [not supported] ", desiredcomp); p[2] = IPCP_COMP_VJ >> 8; p[3] = IPCP_COMP_VJ; p[4] = sp->ipcp.max_state; p[5] = sp->ipcp.compress_cid; break; case IPCP_OPT_ADDRESS: /* This is the address he wants in his end */ desiredaddr = p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; if (desiredaddr == hisaddr || (hisaddr >= 1 && hisaddr <= 254 && desiredaddr != 0)) { /* * Peer's address is same as our value, * or we have set it to 0.0.0.* to * indicate that we do not really care, * this is agreeable. Gonna conf-ack * it. */ if (debug) log(-1, "%s [ack] ", sppp_dotted_quad(hisaddr)); /* record that we've seen it already */ sp->ipcp.flags |= IPCP_HISADDR_SEEN; continue; } /* * The address wasn't agreeable. This is either * he sent us 0.0.0.0, asking to assign him an * address, or he send us another address not * matching our value. Either case, we gonna * conf-nak it with our value. 
* XXX: we should "rej" if hisaddr == 0 */ if (debug) { if (desiredaddr == 0) log(-1, "[addr requested] "); else log(-1, "%s [not agreed] ", sppp_dotted_quad(desiredaddr)); } p[2] = hisaddr >> 24; p[3] = hisaddr >> 16; p[4] = hisaddr >> 8; p[5] = hisaddr; break; } /* Add the option to nak'ed list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } /* * If we are about to conf-ack the request, but haven't seen * his address so far, gonna conf-nak it instead, with the * `address' option present and our idea of his address being * filled in there, to request negotiation of both addresses. * * XXX This can result in an endless req - nak loop if peer * doesn't want to send us his address. Q: What should we do * about it? XXX A: implement the max-failure counter. */ if (rlen == 0 && !(sp->ipcp.flags & IPCP_HISADDR_SEEN) && !gotmyaddr) { buf[0] = IPCP_OPT_ADDRESS; buf[1] = 6; buf[2] = hisaddr >> 24; buf[3] = hisaddr >> 16; buf[4] = hisaddr >> 8; buf[5] = hisaddr; rlen = 6; if (debug) log(-1, "still need hisaddr "); } if (rlen) { if (debug) log(-1, " send conf-nak\n"); sppp_cp_send (sp, PPP_IPCP, CONF_NAK, h->ident, rlen, buf); } else { if (debug) log(-1, " send conf-ack\n"); sppp_cp_send (sp, PPP_IPCP, CONF_ACK, h->ident, origlen, h+1); } free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the IPCP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipcp rej opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION); break; case IPCP_OPT_ADDRESS: /* * Peer doesn't grok address option. This is * bad. XXX Should we better give up here? * XXX We could try old "addresses" option... */ sp->ipcp.opts &= ~(1 << IPCP_OPT_ADDRESS); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the IPCP Configure-NAK option list, and adjust our * negotiation. */ static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; int desiredcomp; u_long wantaddr; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipcp nak opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: if (len >= 6 && p[1] == 6) { desiredcomp = p[2] << 8 | p[3]; if (debug) log(-1, "[wantcomp %#04x] ", desiredcomp); if (desiredcomp == IPCP_COMP_VJ) { sl_compress_init(sp->pp_comp, p[4]); sp->ipcp.max_state = p[4]; sp->ipcp.compress_cid = p[5]; if (debug) log(-1, "[agree] "); } else sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION); } break; case IPCP_OPT_ADDRESS: /* * Peer doesn't like our local IP address. See * if we can do something for him. We'll drop * him our address then. */ if (len >= 6 && p[1] == 6) { wantaddr = p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); if (debug) log(-1, "[wantaddr %s] ", sppp_dotted_quad(wantaddr)); /* * When doing dynamic address assignment, * we accept his offer. 
Otherwise, we * ignore it and thus continue to negotiate * our already existing value. * XXX: Bogus, if he said no once, he'll * just say no again, might as well die. */ if (sp->ipcp.flags & IPCP_MYADDR_DYN) { sppp_set_ip_addr(sp, wantaddr); if (debug) log(-1, "[agree] "); sp->ipcp.flags |= IPCP_MYADDR_SEEN; } } break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_ipcp_tlu(struct sppp *sp) { /* we are up - notify isdn daemon */ if (sp->pp_con) sp->pp_con(sp); } static void sppp_ipcp_tld(struct sppp *sp) { } static void sppp_ipcp_tls(struct sppp *sp) { /* indicate to LCP that it must stay alive */ sp->lcp.protos |= (1 << IDX_IPCP); } static void sppp_ipcp_tlf(struct sppp *sp) { /* we no longer need LCP */ sp->lcp.protos &= ~(1 << IDX_IPCP); sppp_lcp_check_and_close(sp); } static void sppp_ipcp_scr(struct sppp *sp) { char opt[6 /* compression */ + 6 /* address */]; u_long ouraddr; int i = 0; if (sp->ipcp.opts & (1 << IPCP_OPT_COMPRESSION)) { opt[i++] = IPCP_OPT_COMPRESSION; opt[i++] = 6; opt[i++] = IPCP_COMP_VJ >> 8; opt[i++] = IPCP_COMP_VJ; opt[i++] = sp->ipcp.max_state; opt[i++] = sp->ipcp.compress_cid; } if (sp->ipcp.opts & (1 << IPCP_OPT_ADDRESS)) { sppp_get_ip_addrs(sp, &ouraddr, 0, 0); opt[i++] = IPCP_OPT_ADDRESS; opt[i++] = 6; opt[i++] = ouraddr >> 24; opt[i++] = ouraddr >> 16; opt[i++] = ouraddr >> 8; opt[i++] = ouraddr; } sp->confid[IDX_IPCP] = ++sp->pp_seq[IDX_IPCP]; sppp_cp_send(sp, PPP_IPCP, CONF_REQ, sp->confid[IDX_IPCP], i, &opt); } #else /* !INET */ static void sppp_ipcp_init(struct sppp *sp) { } static void sppp_ipcp_up(struct sppp *sp) { } static void sppp_ipcp_down(struct sppp *sp) { } static void sppp_ipcp_open(struct sppp *sp) { } static void sppp_ipcp_close(struct sppp *sp) { } static void sppp_ipcp_TO(void *cookie) { } static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { return (0); } static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipcp_tlu(struct sppp *sp) { } static void sppp_ipcp_tld(struct sppp *sp) { } static void sppp_ipcp_tls(struct sppp *sp) { } static void sppp_ipcp_tlf(struct sppp *sp) { } static void sppp_ipcp_scr(struct sppp *sp) { } #endif /* *--------------------------------------------------------------------------* * * * The IPv6CP implementation. * * * *--------------------------------------------------------------------------* */ #ifdef INET6 static void sppp_ipv6cp_init(struct sppp *sp) { sp->ipv6cp.opts = 0; sp->ipv6cp.flags = 0; sp->state[IDX_IPV6CP] = STATE_INITIAL; sp->fail_counter[IDX_IPV6CP] = 0; sp->pp_seq[IDX_IPV6CP] = 0; sp->pp_rseq[IDX_IPV6CP] = 0; callout_init(&sp->ch[IDX_IPV6CP], 1); } static void sppp_ipv6cp_up(struct sppp *sp) { sppp_up_event(&ipv6cp, sp); } static void sppp_ipv6cp_down(struct sppp *sp) { sppp_down_event(&ipv6cp, sp); } static void sppp_ipv6cp_open(struct sppp *sp) { STDDCL; struct in6_addr myaddr, hisaddr; #ifdef IPV6CP_MYIFID_DYN sp->ipv6cp.flags &= ~(IPV6CP_MYIFID_SEEN|IPV6CP_MYIFID_DYN); #else sp->ipv6cp.flags &= ~IPV6CP_MYIFID_SEEN; #endif sppp_get_ip6_addrs(sp, &myaddr, &hisaddr, 0); /* * If we don't have our address, this probably means our * interface doesn't want to talk IPv6 at all. (This could * be the case if somebody wants to speak only IPX, for * example.) Don't open IPv6CP in this case. 
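 *
 * Otherwise the interface identifier is marked as seen and the
 * IFID option is enabled before the Open event is posted.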
*/ if (IN6_IS_ADDR_UNSPECIFIED(&myaddr)) { /* XXX this message should go away */ if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp_open(): no IPv6 interface\n", SPP_ARGS(ifp)); return; } sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); sppp_open_event(&ipv6cp, sp); } static void sppp_ipv6cp_close(struct sppp *sp) { sppp_close_event(&ipv6cp, sp); } static void sppp_ipv6cp_TO(void *cookie) { sppp_to_event(&ipv6cp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) */ static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *r, *p; struct ifnet *ifp = SP2IFP(sp); int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; struct in6_addr myaddr, desiredaddr, suggestaddr; int ifidcount; int type; int collision, nohisaddr; char ip6buf[INET6_ADDRSTRLEN]; len -= 4; origlen = len; /* * Make sure to allocate a buf that can at least hold a * conf-nak with an `address' option. We might need it below. */ buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); if (! buf) return (0); /* pass 1: see if we can recognize them */ if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp parse opts:", SPP_ARGS(ifp)); p = (void*) (h+1); ifidcount = 0; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: if (len >= 10 && p[1] == 10 && ifidcount == 0) { /* correctly formed address option */ ifidcount++; continue; } if (debug) log(-1, " [invalid]"); break; #ifdef notyet case IPV6CP_OPT_COMPRESSION: if (len >= 4 && p[1] >= 4) { /* correctly formed compress option */ continue; } if (debug) log(-1, " [invalid]"); break; #endif default: /* Others not supported. */ if (debug) log(-1, " [rej]"); break; } /* Add the option to rejected list. 
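 *
 * Pass 2 compares the peer's proposed interface identifier with our
 * own: a usable, non-colliding ifid is conf-ack'ed; a missing or
 * colliding ifid is conf-nak'ed with a suggestion from
 * sppp_suggest_ip6_addr(), except that an ifid which is both
 * missing and colliding is conf-rej'ed.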
*/ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_IPV6CP, CONF_REJ, h->ident, rlen, buf); goto end; } else if (debug) log(-1, "\n"); /* pass 2: parse option values */ sppp_get_ip6_addrs(sp, &myaddr, 0, 0); if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; type = CONF_ACK; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { #ifdef notyet case IPV6CP_OPT_COMPRESSION: continue; #endif case IPV6CP_OPT_IFID: bzero(&desiredaddr, sizeof(desiredaddr)); bcopy(&p[2], &desiredaddr.s6_addr[8], 8); collision = (bcmp(&desiredaddr.s6_addr[8], &myaddr.s6_addr[8], 8) == 0); nohisaddr = IN6_IS_ADDR_UNSPECIFIED(&desiredaddr); desiredaddr.s6_addr16[0] = htons(0xfe80); (void)in6_setscope(&desiredaddr, SP2IFP(sp), NULL); if (!collision && !nohisaddr) { /* no collision, hisaddr known - Conf-Ack */ type = CONF_ACK; if (debug) { log(-1, " %s [%s]", ip6_sprintf(ip6buf, &desiredaddr), sppp_cp_type_name(type)); } continue; } bzero(&suggestaddr, sizeof(suggestaddr)); if (collision && nohisaddr) { /* collision, hisaddr unknown - Conf-Rej */ type = CONF_REJ; bzero(&p[2], 8); } else { /* * - no collision, hisaddr unknown, or * - collision, hisaddr known * Conf-Nak, suggest hisaddr */ type = CONF_NAK; sppp_suggest_ip6_addr(sp, &suggestaddr); bcopy(&suggestaddr.s6_addr[8], &p[2], 8); } if (debug) log(-1, " %s [%s]", ip6_sprintf(ip6buf, &desiredaddr), sppp_cp_type_name(type)); break; } /* Add the option to nak'ed list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen == 0 && type == CONF_ACK) { if (debug) log(-1, " send %s\n", sppp_cp_type_name(type)); sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, origlen, h+1); } else { #ifdef DIAGNOSTIC if (type == CONF_ACK) panic("IPv6CP RCR: CONF_ACK with non-zero rlen"); #endif if (debug) { log(-1, " send %s suggest %s\n", sppp_cp_type_name(type), ip6_sprintf(ip6buf, &suggestaddr)); } sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, rlen, buf); } end: free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the IPv6CP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp rej opts:", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: /* * Peer doesn't grok address option. This is * bad. XXX Should we better give up here? */ sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_IFID); break; #ifdef notyet case IPV6CP_OPT_COMPRESS: sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_COMPRESS); break; #endif } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the IPv6CP Configure-NAK option list, and adjust our * negotiation. 
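 *
 * The only option handled here is the peer's suggested interface
 * identifier; it is adopted only when IPV6CP_MYIFID_DYN is compiled
 * in, otherwise we keep offering our existing value.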
*/ static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; struct in6_addr suggestaddr; char ip6buf[INET6_ADDRSTRLEN]; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp nak opts:", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: /* * Peer doesn't like our local ifid. See * if we can do something for him. We'll drop * him our address then. */ if (len < 10 || p[1] != 10) break; bzero(&suggestaddr, sizeof(suggestaddr)); suggestaddr.s6_addr16[0] = htons(0xfe80); (void)in6_setscope(&suggestaddr, SP2IFP(sp), NULL); bcopy(&p[2], &suggestaddr.s6_addr[8], 8); sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); if (debug) log(-1, " [suggestaddr %s]", ip6_sprintf(ip6buf, &suggestaddr)); #ifdef IPV6CP_MYIFID_DYN /* * When doing dynamic address assignment, * we accept his offer. */ if (sp->ipv6cp.flags & IPV6CP_MYIFID_DYN) { struct in6_addr lastsuggest; /* * If equals to * , * we have a collision. generate new random * ifid. */ sppp_suggest_ip6_addr(&lastsuggest); if (IN6_ARE_ADDR_EQUAL(&suggestaddr, lastsuggest)) { if (debug) log(-1, " [random]"); sppp_gen_ip6_addr(sp, &suggestaddr); } sppp_set_ip6_addr(sp, &suggestaddr, 0); if (debug) log(-1, " [agree]"); sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; } #else /* * Since we do not do dynamic address assignment, * we ignore it and thus continue to negotiate * our already existing value. This can possibly * go into infinite request-reject loop. * * This is not likely because we normally use * ifid based on MAC-address. * If you have no ethernet card on the node, too bad. * XXX should we use fail_counter? */ #endif break; #ifdef notyet case IPV6CP_OPT_COMPRESS: /* * Peer wants different compression parameters. 
*/ break; #endif } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_ipv6cp_tlu(struct sppp *sp) { /* we are up - notify isdn daemon */ if (sp->pp_con) sp->pp_con(sp); } static void sppp_ipv6cp_tld(struct sppp *sp) { } static void sppp_ipv6cp_tls(struct sppp *sp) { /* indicate to LCP that it must stay alive */ sp->lcp.protos |= (1 << IDX_IPV6CP); } static void sppp_ipv6cp_tlf(struct sppp *sp) { #if 0 /* need #if 0 to close IPv6CP properly */ /* we no longer need LCP */ sp->lcp.protos &= ~(1 << IDX_IPV6CP); sppp_lcp_check_and_close(sp); #endif } static void sppp_ipv6cp_scr(struct sppp *sp) { char opt[10 /* ifid */ + 4 /* compression, minimum */]; struct in6_addr ouraddr; int i = 0; if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_IFID)) { sppp_get_ip6_addrs(sp, &ouraddr, 0, 0); opt[i++] = IPV6CP_OPT_IFID; opt[i++] = 10; bcopy(&ouraddr.s6_addr[8], &opt[i], 8); i += 8; } #ifdef notyet if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_COMPRESSION)) { opt[i++] = IPV6CP_OPT_COMPRESSION; opt[i++] = 4; opt[i++] = 0; /* TBD */ opt[i++] = 0; /* TBD */ /* variable length data may follow */ } #endif sp->confid[IDX_IPV6CP] = ++sp->pp_seq[IDX_IPV6CP]; sppp_cp_send(sp, PPP_IPV6CP, CONF_REQ, sp->confid[IDX_IPV6CP], i, &opt); } #else /*INET6*/ static void sppp_ipv6cp_init(struct sppp *sp) { } static void sppp_ipv6cp_up(struct sppp *sp) { } static void sppp_ipv6cp_down(struct sppp *sp) { } static void sppp_ipv6cp_open(struct sppp *sp) { } static void sppp_ipv6cp_close(struct sppp *sp) { } static void sppp_ipv6cp_TO(void *sp) { } static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) { return 0; } static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipv6cp_tlu(struct sppp *sp) { } static void sppp_ipv6cp_tld(struct sppp *sp) { } static void sppp_ipv6cp_tls(struct sppp *sp) { } static void sppp_ipv6cp_tlf(struct sppp *sp) { } static void sppp_ipv6cp_scr(struct sppp *sp) { } #endif /*INET6*/ /* *--------------------------------------------------------------------------* * * * The CHAP implementation. * * * *--------------------------------------------------------------------------* */ /* * The authentication protocols don't employ a full-fledged state machine as * the control protocols do, since they do have Open and Close events, but * not Up and Down, nor are they explicitly terminated. Also, use of the * authentication protocols may be different in both directions (this makes * sense, think of a machine that never accepts incoming calls but only * calls out, it doesn't require the called party to authenticate itself). 
* * Our state machine for the local authentication protocol (we are requesting * the peer to authenticate) looks like: * * RCA- * +--------------------------------------------+ * V scn,tld| * +--------+ Close +---------+ RCA+ * | |<----------------------------------| |------+ * +--->| Closed | TO* | Opened | sca | * | | |-----+ +-------| |<-----+ * | +--------+ irc | | +---------+ * | ^ | | ^ * | | | | | * | | | | | * | TO-| | | | * | |tld TO+ V | | * | | +------->+ | | * | | | | | | * | +--------+ V | | * | | |<----+<--------------------+ | * | | Req- | scr | * | | Sent | | * | | | | * | +--------+ | * | RCA- | | RCA+ | * +------+ +------------------------------------------+ * scn,tld sca,irc,ict,tlu * * * with: * * Open: LCP reached authentication phase * Close: LCP reached terminate phase * * RCA+: received reply (pap-req, chap-response), acceptable * RCN: received reply (pap-req, chap-response), not acceptable * TO+: timeout with restart counter >= 0 * TO-: timeout with restart counter < 0 * TO*: reschedule timeout for CHAP * * scr: send request packet (none for PAP, chap-challenge) * sca: send ack packet (pap-ack, chap-success) * scn: send nak packet (pap-nak, chap-failure) * ict: initialize re-challenge timer (CHAP only) * * tlu: this-layer-up, LCP reaches network phase * tld: this-layer-down, LCP enters terminate phase * * Note that in CHAP mode, after sending a new challenge, while the state * automaton falls back into Req-Sent state, it doesn't signal a tld * event to LCP, so LCP remains in network phase. Only after not getting * any response (or after getting an unacceptable response), CHAP closes, * causing LCP to enter terminate phase. * * With PAP, there is no initial request that can be sent. The peer is * expected to send one based on the successful negotiation of PAP as * the authentication protocol during the LCP option negotiation. * * Incoming authentication protocol requests (remote requests * authentication, we are peer) don't employ a state machine at all, * they are simply answered. Some peers [Ascend P50 firmware rev * 4.50] react allergically when sending IPCP requests while they are * still in authentication phase (thereby violating the standard that * demands that these NCP packets are to be discarded), so we keep * track of the peer demanding us to authenticate, and only proceed to * phase network once we've seen a positive acknowledge for the * authentication. */ /* * Handle incoming CHAP packets. 
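/*
 * Illustrative sketch (not part of the driver) of how the restart
 * counter turns a timeout into the TO+ / TO- events of the diagram
 * above.  The real handling is in sppp_chap_TO() and sppp_pap_TO()
 * further down; the state names and the literal counter value here are
 * assumptions for illustration only.
 */
enum auth_state { A_CLOSED, A_REQ_SENT, A_OPENED };

static enum auth_state
auth_timeout(enum auth_state state, int *rst_counter)
{

        if (--(*rst_counter) < 0)
                return (A_CLOSED);      /* TO-: give up, tld, back to Closed */
        if (state == A_OPENED)
                *rst_counter = 3;       /* TO*: re-arm before re-challenging */
        return (A_REQ_SENT);            /* TO+: re-send the request (scr) */
}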
*/ static void sppp_chap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len; u_char *value, *name, digest[AUTHKEYLEN], dsize; int value_len, name_len; MD5_CTX ctx; len = m->m_pkthdr.len; if (len < 4) { if (debug) log(LOG_DEBUG, SPP_FMT "chap invalid packet length: %d bytes\n", SPP_ARGS(ifp), len); return; } h = mtod (m, struct lcp_header*); if (len > ntohs (h->len)) len = ntohs (h->len); switch (h->type) { /* challenge, failure and success are his authproto */ case CHAP_CHALLENGE: value = 1 + (u_char*)(h+1); value_len = value[-1]; name = value + value_len; name_len = len - value_len - 5; if (name_len < 0) { if (debug) { log(LOG_DEBUG, SPP_FMT "chap corrupted challenge " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*) (h+1), len-4); log(-1, ">\n"); } break; } if (debug) { log(LOG_DEBUG, SPP_FMT "chap input <%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_string((char*) name, name_len); log(-1, " value-size=%d value=", value_len); sppp_print_bytes(value, value_len); log(-1, ">\n"); } /* Compute reply value. */ MD5Init(&ctx); MD5Update(&ctx, &h->ident, 1); MD5Update(&ctx, sp->myauth.secret, sppp_strnlen(sp->myauth.secret, AUTHKEYLEN)); MD5Update(&ctx, value, value_len); MD5Final(digest, &ctx); dsize = sizeof digest; sppp_auth_send(&chap, sp, CHAP_RESPONSE, h->ident, sizeof dsize, (const char *)&dsize, sizeof digest, digest, (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN), sp->myauth.name, 0); break; case CHAP_SUCCESS: if (debug) { log(LOG_DEBUG, SPP_FMT "chap success", SPP_ARGS(ifp)); if (len > 4) { log(-1, ": "); sppp_print_string((char*)(h + 1), len - 4); } log(-1, "\n"); } SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_CHAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) && (sp->lcp.protos & (1 << IDX_CHAP)) == 0) { /* * We are authenticator for CHAP but didn't * complete yet. Leave it to tlu to proceed * to network phase. 
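/*
 * Illustrative userland sketch (not driver code) of the CHAP
 * Challenge/Response body layout (RFC 1994) that the parsing in
 * sppp_chap_input() above relies on: a 4-byte header (code, id, 16-bit
 * length), a one-byte Value-Size, the value itself, and the peer name
 * in the remainder -- hence the "name_len = len - value_len - 5"
 * computation.
 */
#include <stddef.h>
#include <stdint.h>

static int
chap_parse_body(const uint8_t *pkt, size_t len, const uint8_t **value,
    size_t *value_len, const uint8_t **name, size_t *name_len)
{

        if (len < 5)                    /* header plus Value-Size byte */
                return (-1);
        *value_len = pkt[4];            /* Value-Size */
        if (len < 5 + *value_len)
                return (-1);            /* corrupted challenge/response */
        *value = pkt + 5;
        *name = *value + *value_len;
        *name_len = len - *value_len - 5;
        return (0);
}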
*/ SPPP_UNLOCK(sp); break; } SPPP_UNLOCK(sp); sppp_phase_network(sp); break; case CHAP_FAILURE: if (debug) { log(LOG_INFO, SPP_FMT "chap failure", SPP_ARGS(ifp)); if (len > 4) { log(-1, ": "); sppp_print_string((char*)(h + 1), len - 4); } log(-1, "\n"); } else log(LOG_INFO, SPP_FMT "chap failure\n", SPP_ARGS(ifp)); /* await LCP shutdown by authenticator */ break; /* response is my authproto */ case CHAP_RESPONSE: value = 1 + (u_char*)(h+1); value_len = value[-1]; name = value + value_len; name_len = len - value_len - 5; if (name_len < 0) { if (debug) { log(LOG_DEBUG, SPP_FMT "chap corrupted response " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } if (h->ident != sp->confid[IDX_CHAP]) { if (debug) log(LOG_DEBUG, SPP_FMT "chap dropping response for old ID " "(got %d, expected %d)\n", SPP_ARGS(ifp), h->ident, sp->confid[IDX_CHAP]); break; } if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) || bcmp(name, sp->hisauth.name, name_len) != 0) { log(LOG_INFO, SPP_FMT "chap response, his name ", SPP_ARGS(ifp)); sppp_print_string(name, name_len); log(-1, " != expected "); sppp_print_string(sp->hisauth.name, sppp_strnlen(sp->hisauth.name, AUTHNAMELEN)); log(-1, "\n"); } if (debug) { log(LOG_DEBUG, SPP_FMT "chap input(%s) " "<%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs (h->len)); sppp_print_string((char*)name, name_len); log(-1, " value-size=%d value=", value_len); sppp_print_bytes(value, value_len); log(-1, ">\n"); } if (value_len != AUTHKEYLEN) { if (debug) log(LOG_DEBUG, SPP_FMT "chap bad hash value length: " "%d bytes, should be %d\n", SPP_ARGS(ifp), value_len, AUTHKEYLEN); break; } MD5Init(&ctx); MD5Update(&ctx, &h->ident, 1); MD5Update(&ctx, sp->hisauth.secret, sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN)); MD5Update(&ctx, sp->myauth.challenge, AUTHKEYLEN); MD5Final(digest, &ctx); #define FAILMSG "Failed..." #define SUCCMSG "Welcome!" if (value_len != sizeof digest || bcmp(digest, value, value_len) != 0) { /* action scn, tld */ sppp_auth_send(&chap, sp, CHAP_FAILURE, h->ident, sizeof(FAILMSG) - 1, (u_char *)FAILMSG, 0); chap.tld(sp); break; } /* action sca, perhaps tlu */ if (sp->state[IDX_CHAP] == STATE_REQ_SENT || sp->state[IDX_CHAP] == STATE_OPENED) sppp_auth_send(&chap, sp, CHAP_SUCCESS, h->ident, sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG, 0); if (sp->state[IDX_CHAP] == STATE_REQ_SENT) { sppp_cp_change_state(&chap, sp, STATE_OPENED); chap.tlu(sp); } break; default: /* Unknown CHAP packet type -- ignore. */ if (debug) { log(LOG_DEBUG, SPP_FMT "chap unknown input(%s) " "<0x%x id=0x%xh len=%d", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), h->type, h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } } static void sppp_chap_init(struct sppp *sp) { /* Chap doesn't have STATE_INITIAL at all. 
*/ sp->state[IDX_CHAP] = STATE_CLOSED; sp->fail_counter[IDX_CHAP] = 0; sp->pp_seq[IDX_CHAP] = 0; sp->pp_rseq[IDX_CHAP] = 0; callout_init(&sp->ch[IDX_CHAP], 1); } static void sppp_chap_open(struct sppp *sp) { if (sp->myauth.proto == PPP_CHAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) { /* we are authenticator for CHAP, start it */ chap.scr(sp); sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; sppp_cp_change_state(&chap, sp, STATE_REQ_SENT); } /* nothing to be done if we are peer, await a challenge */ } static void sppp_chap_close(struct sppp *sp) { if (sp->state[IDX_CHAP] != STATE_CLOSED) sppp_cp_change_state(&chap, sp, STATE_CLOSED); } static void sppp_chap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "chap TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), sp->rst_counter[IDX_CHAP]); if (--sp->rst_counter[IDX_CHAP] < 0) /* TO- event */ switch (sp->state[IDX_CHAP]) { case STATE_REQ_SENT: chap.tld(sp); sppp_cp_change_state(&chap, sp, STATE_CLOSED); break; } else /* TO+ (or TO*) event */ switch (sp->state[IDX_CHAP]) { case STATE_OPENED: /* TO* event */ sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; /* FALLTHROUGH */ case STATE_REQ_SENT: chap.scr(sp); /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(&chap, sp, STATE_REQ_SENT); break; } SPPP_UNLOCK(sp); } static void sppp_chap_tlu(struct sppp *sp) { STDDCL; int i; i = 0; sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; /* * Some broken CHAP implementations (Conware CoNet, firmware * 4.0.?) don't want to re-authenticate their CHAP once the * initial challenge-response exchange has taken place. * Provide for an option to avoid rechallenges. */ if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) { /* * Compute the re-challenge timeout. This will yield * a number between 300 and 810 seconds. */ i = 300 + ((unsigned)(random() & 0xff00) >> 7); callout_reset(&sp->ch[IDX_CHAP], i * hz, chap.TO, (void *)sp); } if (debug) { log(LOG_DEBUG, SPP_FMT "chap %s, ", SPP_ARGS(ifp), sp->pp_phase == PHASE_NETWORK? "reconfirmed": "tlu"); if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) log(-1, "next re-challenge in %d seconds\n", i); else log(-1, "re-challenging supressed\n"); } SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_CHAP); if (sp->pp_flags & PP_NEEDAUTH) { /* * Remote is authenticator, but his auth proto didn't * complete yet. Defer the transition to network * phase. */ SPPP_UNLOCK(sp); return; } SPPP_UNLOCK(sp); /* * If we are already in phase network, we are done here. This * is the case if this is a dummy tlu event after a re-challenge. */ if (sp->pp_phase != PHASE_NETWORK) sppp_phase_network(sp); } static void sppp_chap_tld(struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp)); callout_stop(&sp->ch[IDX_CHAP]); sp->lcp.protos &= ~(1 << IDX_CHAP); lcp.Close(sp); } static void sppp_chap_scr(struct sppp *sp) { u_long *ch, seed; u_char clen; /* Compute random challenge. 
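/*
 * Worked check (illustration only) of the re-challenge interval
 * computed in sppp_chap_tlu() above: (random() & 0xff00) ranges over
 * 0x0000..0xff00 in steps of 0x100, and ">> 7" maps that to 0..510 in
 * steps of 2, so the result is always between 300 and 810 seconds.
 */
#include <assert.h>

int
main(void)
{
        unsigned int r, i;

        for (r = 0; r <= 0xffff; r++) {
                i = 300 + ((r & 0xff00) >> 7);
                assert(i >= 300 && i <= 810);
        }
        return (0);
}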
*/ ch = (u_long *)sp->myauth.challenge; read_random(&seed, sizeof seed); ch[0] = seed ^ random(); ch[1] = seed ^ random(); ch[2] = seed ^ random(); ch[3] = seed ^ random(); clen = AUTHKEYLEN; sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP]; sppp_auth_send(&chap, sp, CHAP_CHALLENGE, sp->confid[IDX_CHAP], sizeof clen, (const char *)&clen, (size_t)AUTHKEYLEN, sp->myauth.challenge, (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN), sp->myauth.name, 0); } /* *--------------------------------------------------------------------------* * * * The PAP implementation. * * * *--------------------------------------------------------------------------* */ /* * For PAP, we need to keep a little state also if we are the peer, not the * authenticator. This is since we don't get a request to authenticate, but * have to repeatedly authenticate ourself until we got a response (or the * retry counter is expired). */ /* * Handle incoming PAP packets. */ static void sppp_pap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len; u_char *name, *passwd, mlen; int name_len, passwd_len; len = m->m_pkthdr.len; if (len < 5) { if (debug) log(LOG_DEBUG, SPP_FMT "pap invalid packet length: %d bytes\n", SPP_ARGS(ifp), len); return; } h = mtod (m, struct lcp_header*); if (len > ntohs (h->len)) len = ntohs (h->len); switch (h->type) { /* PAP request is my authproto */ case PAP_REQ: name = 1 + (u_char*)(h+1); name_len = name[-1]; passwd = name + name_len + 1; if (name_len > len - 6 || (passwd_len = passwd[-1]) > len - 6 - name_len) { if (debug) { log(LOG_DEBUG, SPP_FMT "pap corrupted input " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_PAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } if (debug) { log(LOG_DEBUG, SPP_FMT "pap input(%s) " "<%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_PAP]), sppp_auth_type_name(PPP_PAP, h->type), h->ident, ntohs(h->len)); sppp_print_string((char*)name, name_len); log(-1, " passwd="); sppp_print_string((char*)passwd, passwd_len); log(-1, ">\n"); } if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) || passwd_len != sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN) || bcmp(name, sp->hisauth.name, name_len) != 0 || bcmp(passwd, sp->hisauth.secret, passwd_len) != 0) { /* action scn, tld */ mlen = sizeof(FAILMSG) - 1; sppp_auth_send(&pap, sp, PAP_NAK, h->ident, sizeof mlen, (const char *)&mlen, sizeof(FAILMSG) - 1, (u_char *)FAILMSG, 0); pap.tld(sp); break; } /* action sca, perhaps tlu */ if (sp->state[IDX_PAP] == STATE_REQ_SENT || sp->state[IDX_PAP] == STATE_OPENED) { mlen = sizeof(SUCCMSG) - 1; sppp_auth_send(&pap, sp, PAP_ACK, h->ident, sizeof mlen, (const char *)&mlen, sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG, 0); } if (sp->state[IDX_PAP] == STATE_REQ_SENT) { sppp_cp_change_state(&pap, sp, STATE_OPENED); pap.tlu(sp); } break; /* ack and nak are his authproto */ case PAP_ACK: callout_stop(&sp->pap_my_to_ch); if (debug) { log(LOG_DEBUG, SPP_FMT "pap success", SPP_ARGS(ifp)); name_len = *((char *)h); if (len > 5 && name_len) { log(-1, ": "); sppp_print_string((char*)(h+1), name_len); } log(-1, "\n"); } SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_PAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) && (sp->lcp.protos & (1 << IDX_PAP)) == 0) { /* * We are authenticator for PAP but didn't * complete yet. Leave it to tlu to proceed * to network phase. 
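/*
 * Illustrative userland sketch (not driver code) of the PAP
 * Authenticate-Request body handled by sppp_pap_input() above: a
 * 4-byte header followed by <peer-id-length, peer-id, passwd-length,
 * password> (RFC 1334).  The "len - 6" bounds in the driver account
 * for the 4 header bytes plus the two one-byte length fields.
 */
#include <stddef.h>
#include <stdint.h>

static int
pap_parse_req(const uint8_t *pkt, size_t len, const uint8_t **name,
    size_t *name_len, const uint8_t **passwd, size_t *passwd_len)
{

        if (len < 6)
                return (-1);
        *name_len = pkt[4];             /* peer-id length */
        if (*name_len > len - 6)
                return (-1);
        *name = pkt + 5;
        *passwd_len = pkt[5 + *name_len];       /* passwd length */
        if (*passwd_len > len - 6 - *name_len)
                return (-1);
        *passwd = pkt + 6 + *name_len;
        return (0);
}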
*/ SPPP_UNLOCK(sp); break; } SPPP_UNLOCK(sp); sppp_phase_network(sp); break; case PAP_NAK: callout_stop (&sp->pap_my_to_ch); if (debug) { log(LOG_INFO, SPP_FMT "pap failure", SPP_ARGS(ifp)); name_len = *((char *)h); if (len > 5 && name_len) { log(-1, ": "); sppp_print_string((char*)(h+1), name_len); } log(-1, "\n"); } else log(LOG_INFO, SPP_FMT "pap failure\n", SPP_ARGS(ifp)); /* await LCP shutdown by authenticator */ break; default: /* Unknown PAP packet type -- ignore. */ if (debug) { log(LOG_DEBUG, SPP_FMT "pap corrupted input " "<0x%x id=0x%x len=%d", SPP_ARGS(ifp), h->type, h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } } static void sppp_pap_init(struct sppp *sp) { /* PAP doesn't have STATE_INITIAL at all. */ sp->state[IDX_PAP] = STATE_CLOSED; sp->fail_counter[IDX_PAP] = 0; sp->pp_seq[IDX_PAP] = 0; sp->pp_rseq[IDX_PAP] = 0; callout_init(&sp->ch[IDX_PAP], 1); callout_init(&sp->pap_my_to_ch, 1); } static void sppp_pap_open(struct sppp *sp) { if (sp->hisauth.proto == PPP_PAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) { /* we are authenticator for PAP, start our timer */ sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); } if (sp->myauth.proto == PPP_PAP) { /* we are peer, send a request, and start a timer */ pap.scr(sp); callout_reset(&sp->pap_my_to_ch, sp->lcp.timeout, sppp_pap_my_TO, (void *)sp); } } static void sppp_pap_close(struct sppp *sp) { if (sp->state[IDX_PAP] != STATE_CLOSED) sppp_cp_change_state(&pap, sp, STATE_CLOSED); } /* * That's the timeout routine if we are authenticator. Since the * authenticator is basically passive in PAP, we can't do much here. */ static void sppp_pap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "pap TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_PAP]), sp->rst_counter[IDX_PAP]); if (--sp->rst_counter[IDX_PAP] < 0) /* TO- event */ switch (sp->state[IDX_PAP]) { case STATE_REQ_SENT: pap.tld(sp); sppp_cp_change_state(&pap, sp, STATE_CLOSED); break; } else /* TO+ event, not very much we could do */ switch (sp->state[IDX_PAP]) { case STATE_REQ_SENT: /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); break; } SPPP_UNLOCK(sp); } /* * That's the timeout handler if we are peer. Since the peer is active, * we need to retransmit our PAP request since it is apparently lost. * XXX We should impose a max counter. */ static void sppp_pap_my_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "pap peer TO\n", SPP_ARGS(ifp)); SPPP_LOCK(sp); pap.scr(sp); SPPP_UNLOCK(sp); } static void sppp_pap_tlu(struct sppp *sp) { STDDCL; sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), pap.name); SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_PAP); if (sp->pp_flags & PP_NEEDAUTH) { /* * Remote is authenticator, but his auth proto didn't * complete yet. Defer the transition to network * phase. 
*/ SPPP_UNLOCK(sp); return; } SPPP_UNLOCK(sp); sppp_phase_network(sp); } static void sppp_pap_tld(struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp)); callout_stop (&sp->ch[IDX_PAP]); callout_stop (&sp->pap_my_to_ch); sp->lcp.protos &= ~(1 << IDX_PAP); lcp.Close(sp); } static void sppp_pap_scr(struct sppp *sp) { u_char idlen, pwdlen; sp->confid[IDX_PAP] = ++sp->pp_seq[IDX_PAP]; pwdlen = sppp_strnlen(sp->myauth.secret, AUTHKEYLEN); idlen = sppp_strnlen(sp->myauth.name, AUTHNAMELEN); sppp_auth_send(&pap, sp, PAP_REQ, sp->confid[IDX_PAP], sizeof idlen, (const char *)&idlen, (size_t)idlen, sp->myauth.name, sizeof pwdlen, (const char *)&pwdlen, (size_t)pwdlen, sp->myauth.secret, 0); } /* * Random miscellaneous functions. */ /* * Send a PAP or CHAP proto packet. * * Varadic function, each of the elements for the ellipsis is of type * ``size_t mlen, const u_char *msg''. Processing will stop iff * mlen == 0. * NOTE: never declare variadic functions with types subject to type * promotion (i.e. u_char). This is asking for big trouble depending * on the architecture you are on... */ static void sppp_auth_send(const struct cp *cp, struct sppp *sp, unsigned int type, unsigned int id, ...) { STDDCL; struct ppp_header *h; struct lcp_header *lh; struct mbuf *m; u_char *p; int len; unsigned int mlen; const char *msg; va_list ap; MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ h->protocol = htons(cp->proto); lh = (struct lcp_header*)(h + 1); lh->type = type; lh->ident = id; p = (u_char*) (lh+1); va_start(ap, id); len = 0; while ((mlen = (unsigned int)va_arg(ap, size_t)) != 0) { msg = va_arg(ap, const char *); len += mlen; if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) { va_end(ap); m_freem(m); return; } bcopy(msg, p, mlen); p += mlen; } va_end(ap); m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; lh->len = htons (LCP_HEADER_LEN + len); if (debug) { log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d", SPP_ARGS(ifp), cp->name, sppp_auth_type_name(cp->proto, lh->type), lh->ident, ntohs(lh->len)); sppp_print_bytes((u_char*) (lh+1), len); log(-1, ">\n"); } if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Flush interface queue. */ static void sppp_qflush(struct ifqueue *ifq) { struct mbuf *m, *n; n = ifq->ifq_head; while ((m = n)) { n = m->m_nextpkt; m_freem (m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; } /* * Send keepalive packets, every 10 seconds. */ static void sppp_keepalive(void *dummy) { struct sppp *sp = (struct sppp*)dummy; struct ifnet *ifp = SP2IFP(sp); SPPP_LOCK(sp); /* Keepalive mode disabled or channel down? */ if (! (sp->pp_flags & PP_KEEPALIVE) || ! (ifp->if_drv_flags & IFF_DRV_RUNNING)) goto out; if (sp->pp_mode == PP_FR) { sppp_fr_keepalive (sp); goto out; } /* No keepalive in PPP mode if LCP not opened yet. */ if (sp->pp_mode != IFF_CISCO && sp->pp_phase < PHASE_AUTHENTICATE) goto out; if (sp->pp_alivecnt == MAXALIVECNT) { /* No keepalive packets got. Stop the interface. */ printf (SPP_FMT "down\n", SPP_ARGS(ifp)); if_down (ifp); sppp_qflush (&sp->pp_cpq); if (sp->pp_mode != IFF_CISCO) { /* XXX */ /* Shut down the PPP link. */ lcp.Down(sp); /* Initiate negotiation. 
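/*
 * Minimal userland sketch (illustration only) of the
 * "(size_t mlen, const u_char *msg), ..., 0" calling convention that
 * sppp_auth_send() above uses: chunks are concatenated until a zero
 * length terminates the list.  As the driver comment warns, the
 * variadic arguments are size_t, not a type subject to promotion.
 */
#include <stdarg.h>
#include <stddef.h>
#include <string.h>

static size_t
concat_chunks(char *dst, size_t cap, ...)
{
        va_list ap;
        size_t len, total = 0;
        const char *chunk;

        va_start(ap, cap);
        while ((len = va_arg(ap, size_t)) != 0) {
                chunk = va_arg(ap, const char *);
                if (total + len > cap)
                        break;          /* would overflow, stop early */
                memcpy(dst + total, chunk, len);
                total += len;
        }
        va_end(ap);
        return (total);
}
/*
 * Example call, mirroring how sppp_pap_scr() assembles its request:
 *   concat_chunks(buf, sizeof(buf), sizeof idlen, &idlen,
 *       (size_t)idlen, name, sizeof pwdlen, &pwdlen,
 *       (size_t)pwdlen, secret, (size_t)0);
 */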
XXX */ lcp.Up(sp); } } if (sp->pp_alivecnt <= MAXALIVECNT) ++sp->pp_alivecnt; if (sp->pp_mode == IFF_CISCO) sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ, ++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]); else if (sp->pp_phase >= PHASE_AUTHENTICATE) { long nmagic = htonl (sp->lcp.magic); sp->lcp.echoid = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, ECHO_REQ, sp->lcp.echoid, 4, &nmagic); } out: SPPP_UNLOCK(sp); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); } /* * Get both IP addresses. */ void sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask) { struct ifnet *ifp = SP2IFP(sp); struct ifaddr *ifa; struct sockaddr_in *si, *sm; u_long ssrc, ddst; sm = NULL; ssrc = ddst = 0L; /* * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ si = 0; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { si = (struct sockaddr_in *)ifa->ifa_addr; sm = (struct sockaddr_in *)ifa->ifa_netmask; if (si) break; } if (ifa) { if (si && si->sin_addr.s_addr) { ssrc = si->sin_addr.s_addr; if (srcmask) *srcmask = ntohl(sm->sin_addr.s_addr); } si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) ddst = si->sin_addr.s_addr; } if_addr_runlock(ifp); if (dst) *dst = ntohl(ddst); if (src) *src = ntohl(ssrc); } #ifdef INET /* * Set my IP address. */ static void sppp_set_ip_addr(struct sppp *sp, u_long src) { STDDCL; struct ifaddr *ifa; struct sockaddr_in *si; struct in_ifaddr *ia; /* * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ si = 0; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { si = (struct sockaddr_in *)ifa->ifa_addr; if (si != NULL) { ifa_ref(ifa); break; } } } if_addr_runlock(ifp); if (ifa != NULL) { int error; /* delete old route */ error = rtinit(ifa, (int)RTM_DELETE, RTF_HOST); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit DEL failed, error=%d\n", SPP_ARGS(ifp), error); } /* set new address */ si->sin_addr.s_addr = htonl(src); ia = ifatoia(ifa); IN_IFADDR_WLOCK(); LIST_REMOVE(ia, ia_hash); LIST_INSERT_HEAD(INADDR_HASH(si->sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* add new route */ error = rtinit(ifa, (int)RTM_ADD, RTF_HOST); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit ADD failed, error=%d", SPP_ARGS(ifp), error); } ifa_free(ifa); } } #endif #ifdef INET6 /* * Get both IPv6 addresses. */ static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst, struct in6_addr *srcmask) { struct ifnet *ifp = SP2IFP(sp); struct ifaddr *ifa; struct sockaddr_in6 *si, *sm; struct in6_addr ssrc, ddst; sm = NULL; bzero(&ssrc, sizeof(ssrc)); bzero(&ddst, sizeof(ddst)); /* * Pick the first link-local AF_INET6 address from the list, * aliases don't make any sense on a p2p link anyway. 
*/ si = NULL; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET6) { si = (struct sockaddr_in6 *)ifa->ifa_addr; sm = (struct sockaddr_in6 *)ifa->ifa_netmask; if (si && IN6_IS_ADDR_LINKLOCAL(&si->sin6_addr)) break; } if (ifa) { if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) { bcopy(&si->sin6_addr, &ssrc, sizeof(ssrc)); if (srcmask) { bcopy(&sm->sin6_addr, srcmask, sizeof(*srcmask)); } } si = (struct sockaddr_in6 *)ifa->ifa_dstaddr; if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) bcopy(&si->sin6_addr, &ddst, sizeof(ddst)); } if (dst) bcopy(&ddst, dst, sizeof(*dst)); if (src) bcopy(&ssrc, src, sizeof(*src)); if_addr_runlock(ifp); } #ifdef IPV6CP_MYIFID_DYN /* * Generate random ifid. */ static void sppp_gen_ip6_addr(struct sppp *sp, struct in6_addr *addr) { /* TBD */ } /* * Set my IPv6 address. */ static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src) { STDDCL; struct ifaddr *ifa; struct sockaddr_in6 *sin6; /* * Pick the first link-local AF_INET6 address from the list, * aliases don't make any sense on a p2p link anyway. */ sin6 = NULL; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (sin6 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { ifa_ref(ifa); break; } } } if_addr_runlock(ifp); if (ifa != NULL) { int error; struct sockaddr_in6 new_sin6 = *sin6; bcopy(src, &new_sin6.sin6_addr, sizeof(new_sin6.sin6_addr)); error = in6_ifinit(ifp, ifatoia6(ifa), &new_sin6, 1); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip6_addr: in6_ifinit " " failed, error=%d\n", SPP_ARGS(ifp), error); } ifa_free(ifa); } } #endif /* * Suggest a candidate address to be used by peer. */ static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *suggest) { struct in6_addr myaddr; struct timeval tv; sppp_get_ip6_addrs(sp, &myaddr, 0, 0); myaddr.s6_addr[8] &= ~0x02; /* u bit to "local" */ microtime(&tv); if ((tv.tv_usec & 0xff) == 0 && (tv.tv_sec & 0xff) == 0) { myaddr.s6_addr[14] ^= 0xff; myaddr.s6_addr[15] ^= 0xff; } else { myaddr.s6_addr[14] ^= (tv.tv_usec & 0xff); myaddr.s6_addr[15] ^= (tv.tv_sec & 0xff); } if (suggest) bcopy(&myaddr, suggest, sizeof(myaddr)); } #endif /*INET6*/ static int sppp_params(struct sppp *sp, u_long cmd, void *data) { u_long subcmd; struct ifreq *ifr = (struct ifreq *)data; struct spppreq *spr; int rv = 0; if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == 0) return (EAGAIN); /* * ifr->ifr_data is supposed to point to a struct spppreq. * Check the cmd word first before attempting to fetch all the * data. */ rv = fueword(ifr->ifr_data, &subcmd); if (rv == -1) { rv = EFAULT; goto quit; } if (copyin((caddr_t)ifr->ifr_data, spr, sizeof(struct spppreq)) != 0) { rv = EFAULT; goto quit; } switch (subcmd) { case (u_long)SPPPIOGDEFS: if (cmd != SIOCGIFGENERIC) { rv = EINVAL; break; } /* * We copy over the entire current state, but clean * out some of the stuff we don't wanna pass up. * Remember, SIOCGIFGENERIC is unprotected, and can be * called by any user. No need to ever get PAP or * CHAP secrets back to userland anyway. 
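/*
 * Userland sketch (assumptions: struct spppreq and SPPPIOGDEFS as
 * declared in net/if_sppp.h, error handling omitted) of how a tool
 * such as spppcontrol reads the state exported by the unprivileged
 * SIOCGIFGENERIC path described above: ifr_data points at a struct
 * spppreq whose cmd word selects the SPPPIOGDEFS subcommand.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <net/if_sppp.h>
#include <string.h>

static int
get_sppp_defs(int s, const char *ifname, struct spppreq *spr)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        memset(spr, 0, sizeof(*spr));
        spr->cmd = SPPPIOGDEFS;         /* the subcmd checked in sppp_params() */
        ifr.ifr_data = (caddr_t)spr;
        return (ioctl(s, SIOCGIFGENERIC, &ifr));
}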
*/ spr->defs.pp_phase = sp->pp_phase; spr->defs.enable_vj = (sp->confflags & CONF_ENABLE_VJ) != 0; spr->defs.enable_ipv6 = (sp->confflags & CONF_ENABLE_IPV6) != 0; spr->defs.lcp = sp->lcp; spr->defs.ipcp = sp->ipcp; spr->defs.ipv6cp = sp->ipv6cp; spr->defs.myauth = sp->myauth; spr->defs.hisauth = sp->hisauth; bzero(spr->defs.myauth.secret, AUTHKEYLEN); bzero(spr->defs.myauth.challenge, AUTHKEYLEN); bzero(spr->defs.hisauth.secret, AUTHKEYLEN); bzero(spr->defs.hisauth.challenge, AUTHKEYLEN); /* * Fixup the LCP timeout value to milliseconds so * spppcontrol doesn't need to bother about the value * of "hz". We do the reverse calculation below when * setting it. */ spr->defs.lcp.timeout = sp->lcp.timeout * 1000 / hz; rv = copyout(spr, (caddr_t)ifr->ifr_data, sizeof(struct spppreq)); break; case (u_long)SPPPIOSDEFS: if (cmd != SIOCSIFGENERIC) { rv = EINVAL; break; } /* * We have a very specific idea of which fields we * allow being passed back from userland, so to not * clobber our current state. For one, we only allow * setting anything if LCP is in dead or establish * phase. Once the authentication negotiations * started, the authentication settings must not be * changed again. (The administrator can force an * ifconfig down in order to get LCP back into dead * phase.) * * Also, we only allow for authentication parameters to be * specified. * * XXX Should allow to set or clear pp_flags. * * Finally, if the respective authentication protocol to * be used is set differently than 0, but the secret is * passed as all zeros, we don't trash the existing secret. * This allows an administrator to change the system name * only without clobbering the secret (which he didn't get * back in a previous SPPPIOGDEFS call). However, the * secrets are cleared if the authentication protocol is * reset to 0. 
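/*
 * Worked example (illustration only) of the timeout fixup above and
 * the reverse conversion in the SPPPIOSDEFS handler below: with
 * hz = 100, an LCP restart timer of 300 ticks is reported as
 * 300 * 1000 / 100 = 3000 ms, and 3000 ms written back becomes
 * 3000 * 100 / 1000 = 300 ticks again.
 */
static unsigned int
lcp_timeout_ticks_to_ms(unsigned int ticks, unsigned int hz)
{
        return (ticks * 1000 / hz);     /* what SPPPIOGDEFS reports */
}

static unsigned int
lcp_timeout_ms_to_ticks(unsigned int ms, unsigned int hz)
{
        return (ms * hz / 1000);        /* what SPPPIOSDEFS stores */
}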
*/ if (sp->pp_phase != PHASE_DEAD && sp->pp_phase != PHASE_ESTABLISH) { rv = EBUSY; break; } if ((spr->defs.myauth.proto != 0 && spr->defs.myauth.proto != PPP_PAP && spr->defs.myauth.proto != PPP_CHAP) || (spr->defs.hisauth.proto != 0 && spr->defs.hisauth.proto != PPP_PAP && spr->defs.hisauth.proto != PPP_CHAP)) { rv = EINVAL; break; } if (spr->defs.myauth.proto == 0) /* resetting myauth */ bzero(&sp->myauth, sizeof sp->myauth); else { /* setting/changing myauth */ sp->myauth.proto = spr->defs.myauth.proto; bcopy(spr->defs.myauth.name, sp->myauth.name, AUTHNAMELEN); if (spr->defs.myauth.secret[0] != '\0') bcopy(spr->defs.myauth.secret, sp->myauth.secret, AUTHKEYLEN); } if (spr->defs.hisauth.proto == 0) /* resetting hisauth */ bzero(&sp->hisauth, sizeof sp->hisauth); else { /* setting/changing hisauth */ sp->hisauth.proto = spr->defs.hisauth.proto; sp->hisauth.flags = spr->defs.hisauth.flags; bcopy(spr->defs.hisauth.name, sp->hisauth.name, AUTHNAMELEN); if (spr->defs.hisauth.secret[0] != '\0') bcopy(spr->defs.hisauth.secret, sp->hisauth.secret, AUTHKEYLEN); } /* set LCP restart timer timeout */ if (spr->defs.lcp.timeout != 0) sp->lcp.timeout = spr->defs.lcp.timeout * hz / 1000; /* set VJ enable and IPv6 disable flags */ #ifdef INET if (spr->defs.enable_vj) sp->confflags |= CONF_ENABLE_VJ; else sp->confflags &= ~CONF_ENABLE_VJ; #endif #ifdef INET6 if (spr->defs.enable_ipv6) sp->confflags |= CONF_ENABLE_IPV6; else sp->confflags &= ~CONF_ENABLE_IPV6; #endif break; default: rv = EINVAL; } quit: free(spr, M_TEMP); return (rv); } static void sppp_phase_network(struct sppp *sp) { STDDCL; int i; u_long mask; sp->pp_phase = PHASE_NETWORK; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify NCPs now. */ for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_NCP) (cps[i])->Open(sp); /* Send Up events to all NCPs. 
*/ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_NCP)) (cps[i])->Up(sp); /* if no NCP is starting, all this was in vain, close down */ sppp_lcp_check_and_close(sp); } static const char * sppp_cp_type_name(u_char type) { static char buf[12]; switch (type) { case CONF_REQ: return "conf-req"; case CONF_ACK: return "conf-ack"; case CONF_NAK: return "conf-nak"; case CONF_REJ: return "conf-rej"; case TERM_REQ: return "term-req"; case TERM_ACK: return "term-ack"; case CODE_REJ: return "code-rej"; case PROTO_REJ: return "proto-rej"; case ECHO_REQ: return "echo-req"; case ECHO_REPLY: return "echo-reply"; case DISC_REQ: return "discard-req"; } snprintf (buf, sizeof(buf), "cp/0x%x", type); return buf; } static const char * sppp_auth_type_name(u_short proto, u_char type) { static char buf[12]; switch (proto) { case PPP_CHAP: switch (type) { case CHAP_CHALLENGE: return "challenge"; case CHAP_RESPONSE: return "response"; case CHAP_SUCCESS: return "success"; case CHAP_FAILURE: return "failure"; } case PPP_PAP: switch (type) { case PAP_REQ: return "req"; case PAP_ACK: return "ack"; case PAP_NAK: return "nak"; } } snprintf (buf, sizeof(buf), "auth/0x%x", type); return buf; } static const char * sppp_lcp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case LCP_OPT_MRU: return "mru"; case LCP_OPT_ASYNC_MAP: return "async-map"; case LCP_OPT_AUTH_PROTO: return "auth-proto"; case LCP_OPT_QUAL_PROTO: return "qual-proto"; case LCP_OPT_MAGIC: return "magic"; case LCP_OPT_PROTO_COMP: return "proto-comp"; case LCP_OPT_ADDR_COMP: return "addr-comp"; } snprintf (buf, sizeof(buf), "lcp/0x%x", opt); return buf; } #ifdef INET static const char * sppp_ipcp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case IPCP_OPT_ADDRESSES: return "addresses"; case IPCP_OPT_COMPRESSION: return "compression"; case IPCP_OPT_ADDRESS: return "address"; } snprintf (buf, sizeof(buf), "ipcp/0x%x", opt); return buf; } #endif #ifdef INET6 static const char * sppp_ipv6cp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case IPV6CP_OPT_IFID: return "ifid"; case IPV6CP_OPT_COMPRESSION: return "compression"; } sprintf (buf, "0x%x", opt); return buf; } #endif static const char * sppp_state_name(int state) { switch (state) { case STATE_INITIAL: return "initial"; case STATE_STARTING: return "starting"; case STATE_CLOSED: return "closed"; case STATE_STOPPED: return "stopped"; case STATE_CLOSING: return "closing"; case STATE_STOPPING: return "stopping"; case STATE_REQ_SENT: return "req-sent"; case STATE_ACK_RCVD: return "ack-rcvd"; case STATE_ACK_SENT: return "ack-sent"; case STATE_OPENED: return "opened"; } return "illegal"; } static const char * sppp_phase_name(enum ppp_phase phase) { switch (phase) { case PHASE_DEAD: return "dead"; case PHASE_ESTABLISH: return "establish"; case PHASE_TERMINATE: return "terminate"; case PHASE_AUTHENTICATE: return "authenticate"; case PHASE_NETWORK: return "network"; } return "illegal"; } static const char * sppp_proto_name(u_short proto) { static char buf[12]; switch (proto) { case PPP_LCP: return "lcp"; case PPP_IPCP: return "ipcp"; case PPP_PAP: return "pap"; case PPP_CHAP: return "chap"; case PPP_IPV6CP: return "ipv6cp"; } snprintf(buf, sizeof(buf), "proto/0x%x", (unsigned)proto); return buf; } static void sppp_print_bytes(const u_char *p, u_short len) { if (len) log(-1, " %*D", len, p, "-"); } static void sppp_print_string(const char *p, u_short len) { u_char c; while (len-- > 0) { c = *p++; /* * Print only ASCII chars 
directly. RFC 1994 recommends * using only them, but we don't rely on it. */ if (c < ' ' || c > '~') log(-1, "\\x%x", c); else log(-1, "%c", c); } } #ifdef INET static const char * sppp_dotted_quad(u_long addr) { static char s[16]; sprintf(s, "%d.%d.%d.%d", (int)((addr >> 24) & 0xff), (int)((addr >> 16) & 0xff), (int)((addr >> 8) & 0xff), (int)(addr & 0xff)); return s; } #endif static int sppp_strnlen(u_char *p, int max) { int len; for (len = 0; len < max && *p; ++p) ++len; return len; } /* a dummy, used to drop uninteresting events */ static void sppp_null(struct sppp *unused) { /* do just nothing */ } Index: head/sys/net/if_stf.c =================================================================== --- head/sys/net/if_stf.c (revision 286000) +++ head/sys/net/if_stf.c (revision 286001) @@ -1,783 +1,786 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. 
* Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; struct mtx sc_ro_mtx; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; /* * Note that mutable fields in the softc are not currently locked. * We do lock sc_ro in stf_output though. 
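/*
 * Standalone sketch (illustration only) of the 6to4 mapping behind the
 * IN6_IS_ADDR_6TO4() and GET_V4() macros above: a 6to4 address starts
 * with the 2002::/16 prefix and carries the embedded IPv4 address in
 * bytes 2..5, so 10.1.1.1 corresponds to the prefix 2002:0a01:0101::/48.
 */
#include <netinet/in.h>
#include <string.h>

static int
six_to_four_extract(const struct in6_addr *a6, struct in_addr *a4)
{

        if (a6->s6_addr[0] != 0x20 || a6->s6_addr[1] != 0x02)
                return (-1);            /* not a 2002::/16 address */
        memcpy(a4, &a6->s6_addr[2], sizeof(*a4));       /* embedded IPv4 */
        return (0);
}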
*/ static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; extern struct domain inetdomain; static int in_stf_input(struct mbuf **, int *, int); static struct protosw in_stf_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_stf_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { int err, unit; struct stf_softc *sc; struct ifnet *ifp; /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. 
*/ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; mtx_init(&(sc)->sc_ro_mtx, "stf ro", NULL, MTX_DEF); sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_protosw, sc); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err; err = encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); mtx_destroy(&(sc)->sc_ro_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(mod, type, data) module_t mod; int type; void *data; { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; /* LINTED const cast */ m_copydata((struct mbuf *)(uintptr_t)m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. 
* success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; if_addr_runlock(ifp); return (0); } if_addr_runlock(ifp); return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
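/*
 * Illustration only: the traffic-class extraction used by stf_output()
 * above and in_stf_input() below.  The first 32 bits of the IPv6
 * header are version (4 bits), traffic class (8 bits) and flow label
 * (20 bits), so after ntohl() the class sits at bits 20..27; e.g. a
 * header word of 0x6e000000 yields traffic class 0xe0.
 */
#include <stdint.h>
#include <arpa/inet.h>

static uint8_t
ip6_flow_tclass(uint32_t ip6_flow_net)
{

        return ((ntohl(ip6_flow_net) >> 20) & 0xff);
}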
*/ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(in) struct in_addr *in; { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(sc, in, inifp) struct stf_softc *sc; struct in_addr *in; struct ifnet *inifp; /* incoming interface */ { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with broadcast */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = *in; rt = rtalloc1_fib((struct sockaddr *)&sin, 0, 0UL, sc->sc_fibnum); if (!rt || rt->rt_ifp != inifp) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(STF2IFP(sc)), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) RTFREE_LOCKED(rt); return -1; } RTFREE_LOCKED(rt); } return 0; } static int stf_checkaddr6(sc, in6, inifp) struct stf_softc *sc; struct in6_addr *in6; struct ifnet *inifp; /* incoming interface */ { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; bcopy(GET_V4(in6), &in4, sizeof(in4)); return stf_checkaddr4(sc, &in4, inifp); } /* * reject anything that look suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. 
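/*
 * Standalone check (illustration only) of the mask arithmetic in
 * isrfc1918addr() above: for 172.16.0.0/12, masking with 0xfff00000
 * and shifting right by 16 leaves 0xac10 == 172 * 256 + 16, so
 * everything from 172.16.0.0 through 172.31.255.255 matches while
 * 172.32.0.0 does not.  The constants are host-order values, i.e.
 * what ntohl() produces in the driver.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t lo = (172u << 24) | (16u << 16);               /* 172.16.0.0 */
        uint32_t hi = (172u << 24) | (31u << 16) | 0xffff;      /* 172.31.255.255 */
        uint32_t out = (172u << 24) | (32u << 16);              /* 172.32.0.0 */

        assert(((lo & 0xfff00000) >> 16) == 172 * 256 + 16);
        assert(((hi & 0xfff00000) >> 16) == 172 * 256 + 16);
        assert(((out & 0xfff00000) >> 16) != 172 * 256 + 16);
        return (0);
}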
*/ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } static int in_stf_input(struct mbuf **mp, int *offp, int proto) { struct stf_softc *sc; struct ip *ip; struct ip6_hdr *ip6; struct mbuf *m; u_int8_t otos, itos; struct ifnet *ifp; int off; m = *mp; off = *offp; if (proto != IPPROTO_IPV6) { m_freem(m); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); sc = (struct stf_softc *)encap_getarg(m); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. 
*/ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(ifp, cmd, data) struct ifnet *ifp; u_long cmd; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } Index: head/sys/netinet/if_ether.c =================================================================== --- head/sys/netinet/if_ether.c (revision 286000) +++ head/sys/netinet/if_ether.c (revision 286001) @@ -1,974 +1,977 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. 
count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #define SIN(s) ((const struct sockaddr_in *)(s)) #define SDL(s) ((struct sockaddr_dl *)s) SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ static VNET_DEFINE(int, arp_maxtries) = 5; static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); static void arp_init(void); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; #ifdef AF_INET /* * called by in_scrubprefix() to remove entry from the table when * the interface goes away */ void arp_ifscrub(struct ifnet *ifp, uint32_t addr) { struct sockaddr_in addr4; bzero((void *)&addr4, sizeof(addr4)); addr4.sin_len = sizeof(addr4); addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = addr; IF_AFDATA_WLOCK(ifp); lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR), (struct sockaddr *)&addr4); IF_AFDATA_WUNLOCK(ifp); } #endif /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; if (lle->la_flags & LLE_STATIC) { return; } LLE_WLOCK(lle); if (callout_pending(&lle->la_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. 
The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by arpresolve() below. */ LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->la_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { size_t pkts_dropped; LLE_REMREF(lle); pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } else LLE_FREE_LOCKED(lle); IF_AFDATA_UNLOCK(ifp); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } IF_ADDR_RUNLOCK(ifp); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, NULL); ARPSTAT_INC(txrequests); } /* * Resolve an IP address into an ethernet address. * On input: * ifp is the interface we use * is_gw != if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is where we want the address. * flags returns lle entry flags. * * On success, desten and flags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. 
*/ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { struct llentry *la = 0; u_int flags = 0; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; if (pflags != NULL) *pflags = 0; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } retry: IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); IF_AFDATA_RUNLOCK(ifp); if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { flags |= (LLE_CREATE | LLE_EXCLUSIVE); IF_AFDATA_WLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); IF_AFDATA_WUNLOCK(ifp); } if (la == NULL) { if (flags & LLE_CREATE) log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname); m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { bcopy(&la->ll_addr, desten, ifp->if_addrlen); renew = 0; /* * If entry has an expiry time and it is approaching, * see if we need to send an ARP request within this * arpt_down interval. */ if (!(la->la_flags & LLE_STATIC) && time_uptime + la->la_preempt > la->la_expire) { renew = 1; la->la_preempt--; } if (pflags != NULL) *pflags = la->la_flags; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(la); else LLE_RUNLOCK(la); if (renew == 1) arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (0); } if (la->la_flags & LLE_STATIC) { /* should not happen! */ log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); error = EINVAL; goto done; } renew = (la->la_asked == 0 || la->la_expire != time_uptime); if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { flags |= LLE_EXCLUSIVE; LLE_RUNLOCK(la); goto retry; } /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; if (renew == 0 && (flags & LLE_EXCLUSIVE)) { flags &= ~LLE_EXCLUSIVE; LLE_DOWNGRADE(la); } } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->la_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } done: if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(la); else LLE_RUNLOCK(la); return (error); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. 
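While an entry is unresolved, arpresolve() above parks outbound packets on the entry's la_hold chain, capped at net.link.ether.inet.maxhold packets; when the cap is reached the oldest held packet is dropped before the new one is appended. The standalone program below is a sketch of just that drop-oldest/append-newest policy under an assumed cap of 1; the pkt structure, MAXHOLD constant, and hold_packet() helper are illustrative stand-ins, not kernel code.

#include <stdio.h>
#include <stdlib.h>

#define MAXHOLD 1                       /* stand-in for V_arp_maxhold */

struct pkt {                            /* toy stand-in for an mbuf */
        int id;
        struct pkt *nextpkt;            /* mirrors m_nextpkt chaining */
};

static struct pkt *hold_head;           /* mirrors la->la_hold */
static int numheld;                     /* mirrors la->la_numheld */
static int dropped;

/* Queue a packet for an unresolved entry, dropping the oldest when full. */
static void
hold_packet(struct pkt *p)
{
        struct pkt *curr;

        if (numheld >= MAXHOLD && hold_head != NULL) {
                curr = hold_head;       /* oldest packet sits at the head */
                hold_head = curr->nextpkt;
                numheld--;
                dropped++;
                free(curr);
        }
        if (hold_head != NULL) {
                for (curr = hold_head; curr->nextpkt != NULL; )
                        curr = curr->nextpkt;
                curr->nextpkt = p;      /* newest packet goes at the tail */
        } else
                hold_head = p;
        numheld++;
}

int
main(void)
{
        for (int i = 0; i < 3; i++) {
                struct pkt *p = calloc(1, sizeof(*p));
                p->id = i;
                hold_packet(p);
        }
        printf("held %d packet(s), dropped %d\n", numheld, dropped);
        return (0);
}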
*/ static void arpintr(struct mbuf *m) { struct arphdr *ar; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n"); return; } ar = mtod(m, struct arphdr *); if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 && ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) { log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)" " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "", ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":", ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":"); m_freem(m); return; } if (m->m_len < arphdr_len(ar)) { if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { log(LOG_NOTICE, "arp: runt packet\n"); m_freem(m); return; } ar = mtod(m, struct arphdr *); } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); #define ARP_LOG(pri, ...) 
do { \ if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) static void in_arpinput(struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL; struct rtentry *rt; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op, flags; int req_len; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { ARP_LOG(LOG_NOTICE, "runt packet -- m_pullup failed\n"); return; } ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. 
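The in_arpinput() hunks above are part of this revision's conversion of the in_ifaddr lock to an rmlock: every reader now declares a struct rm_priotracker on its stack and passes it to IN_IFADDR_RLOCK()/IN_IFADDR_RUNLOCK(). The fragment below is a minimal kernel-only sketch of the underlying sys/rmlock.h read/write pattern that such a macro presumably wraps; the lock name and the protected counter are invented for illustration.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock example_lock;      /* hypothetical; stands in for the in_ifaddr lock */
static int protected_value;

static void
example_init(void)
{
        rm_init(&example_lock, "example rmlock");
}

/* Read side: cheap, may run concurrently with other readers. */
static int
example_read(void)
{
        struct rm_priotracker tracker;  /* per-reader state, lives on the stack */
        int v;

        rm_rlock(&example_lock, &tracker);
        v = protected_value;
        rm_runlock(&example_lock, &tracker);
        return (v);
}

/* Write side: exclusive, waits for all readers to drain. */
static void
example_write(int v)
{
        rm_wlock(&example_lock);
        protected_value = v;
        rm_wunlock(&example_lock);
}

Because each reader carries its own tracker, read acquisition avoids bouncing a shared cache line between CPUs, which is why the conversion pays off on hot, per-packet read paths like the address hash walks above.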
*/ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); /* * If bridging, fall back to using any inet address. */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " "%s!\n", inet_ntoa(isaddr)); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0; flags |= LLE_EXCLUSIVE; IF_AFDATA_LOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin); IF_AFDATA_UNLOCK(ifp); if (la != NULL) { /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", inet_ntoa(isaddr), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); goto reply; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); goto reply; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, (u_char *)&la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } if (ifp->if_addrlen != ah->ar_hln) { LLE_WUNLOCK(la); ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; canceled = callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. 
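Both arpresolve() and the in_arpinput() path above rearm the per-entry expiry callout with the same reference dance: take an extra reference on behalf of the callout, then drop it again only if callout_reset() reports that it cancelled a still-pending call, since that cancelled arptimer() run would otherwise never execute to release the reference it owned. The fragment below is a schematic restatement of that invariant using only names that already appear in this file; it is an illustration, not an excerpt from the tree.

/*
 * Schematic: rearm an llentry timer without leaking or losing references
 * (the pattern used by arpresolve() and in_arpinput() above).
 */
static void
lle_rearm_timer(struct llentry *la, int seconds)
{
        int canceled;

        LLE_ADDREF(la);                 /* reference owned by the callout */
        la->la_expire = time_uptime + seconds;
        canceled = callout_reset(&la->la_timer, hz * seconds, arptimer, la);
        if (canceled) {
                /*
                 * A previously scheduled arptimer() was still pending and
                 * has now been cancelled; it will never run to drop the
                 * reference it held, so drop it here instead.
                 */
                LLE_REMREF(la);
        }
}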
*/ if (la->la_hold != NULL) { struct mbuf *m_hold, *m_hold_next; m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; memcpy(&sa, L3_ADDR(la), sizeof(sa)); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { struct llentry *lle = NULL; sin.sin_addr = itaddr; IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_RUNLOCK(ifp); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (!rt->rt_ifp || rt->rt_ifp == ifp) { RTFREE_LOCKED(rt); goto drop; } RTFREE_LOCKED(rt); (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ sin.sin_addr = isaddr; /* XXX MRT use table 0 for arp checks */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; if (rt->rt_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s, expecting %s\n", inet_ntoa(isaddr), ifp->if_xname, rt->rt_ifp->if_xname); RTFREE_LOCKED(rt); goto drop; } RTFREE_LOCKED(rt); #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa(itaddr)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; m_clrprotoflags(m); /* Avoid confusing lower layers. 
*/ (*ifp->if_output)(ifp, m, &sa, NULL); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { struct llentry *lle; if (ifa->ifa_carp != NULL) return; if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); /* * interface address is considered static entry * because the output of the arp utility shows * that L2 entry as permanent */ IF_AFDATA_LOCK(ifp); lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC), (struct sockaddr *)IA_SIN(ifa)); IF_AFDATA_UNLOCK(ifp); if (lle == NULL) log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); else LLE_RUNLOCK(lle); } ifa->ifa_rtrequest = NULL; } void arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, enaddr); ifa->ifa_rtrequest = NULL; } static void arp_init(void) { netisr_register(&arp_nh); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 286000) +++ head/sys/netinet/igmp.c (revision 286001) @@ -1,3652 +1,3657 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. 
* * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. 
* * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. 
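The ordering rule spelled out above (IN_MULTI_LOCK, then IGMP_LOCK, then IF_ADDR_LOCK, released in reverse) is what every query and timer handler later in this file follows. The schematic below shows the shape those handlers take, using the macros as they appear here; the per-group work is elided and the function is illustrative only.

/* Schematic only: the lock order used by the IGMP query/timer handlers. */
static void
igmp_example_walk(struct ifnet *ifp)
{
        struct ifmultiaddr *ifma;

        IN_MULTI_LOCK();                /* 1st: protects in_multi state */
        IGMP_LOCK();                    /* 2nd: protects igmp_ifsoftc and the output queue */

        IF_ADDR_RLOCK(ifp);             /* 3rd: protects the if_multiaddrs list */
        TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
                if (ifma->ifma_addr->sa_family != AF_INET ||
                    ifma->ifma_protospec == NULL)
                        continue;
                /* ... per-group processing ... */
        }
        IF_ADDR_RUNLOCK(ifp);

        IGMP_UNLOCK();                  /* release in reverse order */
        IN_MULTI_UNLOCK();
}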
*/ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
*/ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * inet_ntoa_haddr(in_addr_t haddr) { struct in_addr ia; ia.s_addr = htonl(haddr); return (inet_ntoa(ia)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
*/ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. 
*/ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } #ifdef INVARIANTS panic("%s: igmp_ifsoftc not found for ifp %p\n", __func__, ifp); #endif } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. 
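The comment above describes how igmp_input_v1_query() validates a v1 query: it must be addressed to the all-hosts group 224.0.0.1 and, as the code that follows shows, a non-zero group field is counted as a bad query. The standalone program below reproduces just that check; the is_allhosts()/is_nullhost() helpers are local stand-ins for the kernel's in_allhosts()/in_nullhost() macros, and the sample addresses are arbitrary.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for the kernel's in_allhosts()/in_nullhost() helpers. */
static bool
is_allhosts(struct in_addr a)
{
        return (ntohl(a.s_addr) == 0xe0000001UL);       /* 224.0.0.1 */
}

static bool
is_nullhost(struct in_addr a)
{
        return (a.s_addr == 0);                         /* 0.0.0.0 */
}

/* Mirror of the v1 sanity check: a general query to the all-hosts group. */
static bool
v1_query_ok(struct in_addr ip_dst, struct in_addr igmp_group)
{
        return (is_allhosts(ip_dst) && is_nullhost(igmp_group));
}

int
main(void)
{
        struct in_addr allhosts, somegroup, zero;

        inet_pton(AF_INET, "224.0.0.1", &allhosts);
        inet_pton(AF_INET, "239.1.2.3", &somegroup);
        zero.s_addr = 0;

        printf("to 224.0.0.1, group 0.0.0.0:   %s\n",
            v1_query_ok(allhosts, zero) ? "accept" : "drop");
        printf("to 224.0.0.1, group 239.1.2.3: %s\n",
            v1_query_ok(allhosts, somegroup) ? "accept" : "drop");
        return (0);
}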
*/ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. 
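igmp_input_v2_query() above converts the query's Max Response Time, carried in igmp_code in units of 1/10 second, into fast-timeout ticks as igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE, clamped to at least one tick. The snippet below works that arithmetic through; the PR_FASTHZ and IGMP_TIMER_SCALE values are the conventional ones (five fast ticks per second, code in tenths of a second) and are stated here as assumptions rather than quoted from the headers.

#include <stdio.h>

#define PR_FASTHZ               5       /* assumed: fast timeout runs 5 times/sec */
#define IGMP_TIMER_SCALE        10      /* assumed: igmp_code is in 1/10ths of a second */

int
main(void)
{
        unsigned codes[] = { 100, 25, 1 };      /* 10 s, 2.5 s, 0.1 s */

        for (int i = 0; i < 3; i++) {
                unsigned timer = codes[i] * PR_FASTHZ / IGMP_TIMER_SCALE;

                if (timer == 0)
                        timer = 1;              /* never schedule a zero-tick response */
                printf("igmp_code %3u -> %u fast tick(s)\n", codes[i], timer);
        }
        return (0);
}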
*/ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. 
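igmp_input_v3_query() above expands the Max Resp Code and QQIC fields through the IGMP_MANT()/IGMP_EXP() macros: values below 128 are taken literally, larger values are a mantissa/exponent encoding. The standalone program below decodes the field the way RFC 3376 sections 4.1.1 and 4.1.7 specify, (mant | 0x10) << (exp + 3); it restates the on-wire encoding rather than the igmp_var.h macros, so the helper name and bit masks here should be read as illustrative.

#include <stdint.h>
#include <stdio.h>

/*
 * Decode an IGMPv3 Max Resp Code or QQIC field per RFC 3376:
 * codes < 128 are literal; codes >= 128 encode |1|exp(3)|mant(4)|
 * and expand to (mant | 0x10) << (exp + 3).
 * Units: 1/10 s for Max Resp Code, whole seconds for QQIC.
 */
static unsigned int
igmpv3_decode_code(uint8_t code)
{
        unsigned int mant, exp;

        if (code < 128)
                return (code);
        mant = code & 0x0f;
        exp = (code >> 4) & 0x07;
        return ((mant | 0x10) << (exp + 3));
}

int
main(void)
{
        uint8_t samples[] = { 100, 128, 0xff };

        for (int i = 0; i < 3; i++)
                printf("code 0x%02x -> %u\n",
                    samples[i], igmpv3_decode_code(samples[i]));
        return (0);
}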
*/ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a recieved IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. 
*/ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. 
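/*
 * Editor's sketch (not from the original change) of the version
 * classification igmp_input() performs above: an 8-byte query with a zero
 * code is IGMPv1, an 8-byte query with a non-zero code is IGMPv2, and
 * anything at least 12 bytes long is IGMPv3.  The local constants stand in
 * for IGMP_MINLEN and IGMP_V3_QUERY_MINLEN.
 */
#include <stdint.h>

#define QUERY_MINLEN_V1V2	8	/* IGMP_MINLEN */
#define QUERY_MINLEN_V3		12	/* IGMP_V3_QUERY_MINLEN */

static int
classify_query(uint16_t igmplen, uint8_t code)
{
	if (igmplen == QUERY_MINLEN_V1V2)
		return (code == 0 ? 1 : 2);	/* v1 queries carry a zero code */
	if (igmplen >= QUERY_MINLEN_V3)
		return (3);
	return (0);				/* too short; counted as igps_rcv_tooshort */
}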
*/ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. 
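/*
 * Editor's sketch of the countdown idiom the per-group timer processing
 * above relies on: a zero timer is idle, a timer that hits zero on this
 * tick has expired and must be acted on now, and any other value re-arms
 * the global "timers still running" flag so the fast timeout keeps firing.
 * Names are hypothetical; each tick is one igmp_fasttimo run (about 200 ms).
 */
#include <stdbool.h>

static bool timers_still_running;

static bool
tick(unsigned int *ticks)
{
	if (*ticks == 0)
		return (false);			/* idle */
	if (--*ticks == 0)
		return (true);			/* expired on this tick */
	timers_still_running = true;		/* keep the fast timer armed */
	return (false);
}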
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. 
*/ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. 
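/*
 * Editor's worked example for the "Older Version Querier Present" timer
 * computed in igmp_set_version() above (RFC 3376, 8.12): timeout =
 * RV * QI + QRI, converted to slow-timeout ticks.  The figures below are
 * the protocol defaults and PR_SLOWHZ = 2 ticks per second is assumed;
 * the kernel uses the values learned from the querier.
 */
#include <stdio.h>

int
main(void)
{
	const int rv = 2, qi = 125, qri = 10;	/* defaults, in seconds */
	const int slowhz = 2;			/* PR_SLOWHZ: slow ticks per second */
	int timeout = rv * qi + qri;

	printf("older version querier present: %d s (%d slow ticks)\n",
	    timeout, timeout * slowhz);
	return (0);
}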
*/ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. 
*/ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. 
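/*
 * Editor's userland sketch of what igmp_v1v2_queue_report() above puts on
 * the wire: an 8-byte IGMPv1/v2 message (type, code, checksum, group)
 * protected by the one's-complement Internet checksum.  The kernel builds
 * the same bytes in an mbuf, fills in the IP header itself and hands the
 * packet to the IGMP netisr; everything below is a simplified stand-in.
 */
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

struct igmp_msg {
	uint8_t		type;		/* e.g. 0x16: IGMPv2 membership report */
	uint8_t		code;		/* 0 in reports */
	uint16_t	cksum;
	struct in_addr	group;
};

static uint16_t
inet_cksum(const void *data, size_t len)
{
	const uint16_t *w = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *w++;
		len -= 2;
	}
	if (len == 1)
		sum += *(const uint8_t *)w;
	sum = (sum >> 16) + (sum & 0xffff);
	sum += (sum >> 16);
	return ((uint16_t)~sum);
}

static void
build_v2_report(struct igmp_msg *msg, struct in_addr group)
{
	memset(msg, 0, sizeof(*msg));
	msg->type = 0x16;		/* IGMP_v2_HOST_MEMBERSHIP_REPORT */
	msg->group = group;
	msg->cksum = inet_cksum(msg, sizeof(*msg));
}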
*/ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. 
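/*
 * Editor's sketch of the decision igmp_change_state() makes above from the
 * group's filter mode at T0 (last report sent) and T1 (now): a transition
 * out of MCAST_UNDEFINED is an initial join, a transition into it is a
 * final leave, and everything else is handled as a mode or source-filter
 * change.  Enum and function names are illustrative.
 */
enum fmode { FMODE_UNDEFINED, FMODE_INCLUDE, FMODE_EXCLUDE };
enum change { CHANGE_JOIN, CHANGE_LEAVE, CHANGE_FILTER };

static enum change
classify_state_change(enum fmode t0, enum fmode t1)
{
	if (t0 == FMODE_UNDEFINED && t1 != FMODE_UNDEFINED)
		return (CHANGE_JOIN);	/* state change from INCLUDE {} */
	if (t1 == FMODE_UNDEFINED && t0 != FMODE_UNDEFINED)
		return (CHANGE_LEAVE);	/* state change to INCLUDE {} */
	return (CHANGE_FILTER);		/* intermediate state change */
}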
*/ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
*/ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. 
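/*
 * Editor's worked example for the m0srcs computation above, i.e. how many
 * source addresses fit in the first group record of a freshly allocated
 * packet.  Assuming a 1500-byte MTU and 32 bytes of IGMP_LEADINGSPACE
 * (20-byte IP header + 4-byte Router Alert option + 8-byte IGMPv3 report
 * header), with an 8-byte group record header and 4 bytes per source:
 *
 *	m0srcs = (1500 - 32 - 8) / 4 = 365 sources in the first record.
 *
 * The figures are illustrative; the kernel uses the interface's real MTU.
 */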
* Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
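/*
 * Editor's sketch of the group record wire format being filled in above
 * (RFC 3376, 4.2.4), mirroring struct igmp_grouprec plus its trailing
 * source list.  The source count is patched in after the sources have
 * been appended, exactly as pig->ig_numsrc = htons(msrcs) does above.
 */
#include <stdint.h>
#include <netinet/in.h>

struct grouprec_hdr {			/* 8 bytes, all fields in network order */
	uint8_t		type;		/* MODE_IS_*, CHANGE_TO_*, ALLOW/BLOCK */
	uint8_t		datalen;	/* auxiliary data length; 0 here */
	uint16_t	numsrc;		/* number of source addresses that follow */
	struct in_addr	group;		/* multicast group address */
	/* followed by numsrc * sizeof(struct in_addr) source addresses */
};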
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
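/*
 * Editor's illustration of how the rectype_t values defined above drive
 * the filter-change walk in the function that follows: the source tree is
 * walked once per record type, the first delta seen on a pass fixes the
 * current type (crt), and crt = ~crt & REC_FULL flips to the other type
 * until both passes are complete (drt == REC_FULL).  Standalone sketch
 * with local names:
 */
#include <stdio.h>

enum { PASS_NONE = 0x00, PASS_ALLOW = 0x01, PASS_BLOCK = 0x02,
    PASS_FULL = PASS_ALLOW | PASS_BLOCK };

int
main(void)
{
	int crt = PASS_ALLOW;	/* really chosen by the first delta seen */
	int drt = PASS_NONE;	/* record types completed so far */

	while (drt != PASS_FULL) {
		printf("emit %s records\n",
		    crt == PASS_ALLOW ? "ALLOW_NEW" : "BLOCK_OLD");
		drt |= crt;
		crt = ~crt & PASS_FULL;
	}
	return (0);
}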
*/ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
*/ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. 
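/*
 * Editor's sketch of the per-source delta test in the tree walk above: a
 * source is reported only if its filter mode changed between T0 and T1,
 * an INCLUDE source on an EXCLUDE-mode group is never reported, and a
 * source that became undefined at T1 is treated as the inverse of the
 * group's filter mode so it lands in the opposite record type.  The
 * numeric values mirror MCAST_UNDEFINED/INCLUDE/EXCLUDE; names are
 * illustrative.
 */
enum srcmode { SRC_UNDEFINED = 0x00, SRC_INCLUDE = 0x01, SRC_EXCLUDE = 0x02 };

static int				/* 0 = skip, 0x01 = ALLOW_NEW, 0x02 = BLOCK_OLD */
classify_delta(enum srcmode then, enum srcmode now, enum srcmode group_mode)
{
	if (now == then)
		return (0x00);			/* unchanged: skip */
	if (group_mode == SRC_EXCLUDE && now == SRC_INCLUDE)
		return (0x00);			/* IN source on EX group: skip */
	if (now == SRC_UNDEFINED)
		return (~group_mode & 0x03);	/* inverse of the group's mode */
	return ((int)now);			/* ALLOW_NEW or BLOCK_OLD */
}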
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. 
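/*
 * Editor's sketch of the merge test igmp_v3_merge_state_changes() applies
 * above: a pending state-change packet is folded into the tail of the
 * output queue only if the combined record count stays within the 65,535
 * record limit of an IGMPv3 report and the combined length still fits the
 * link MTU minus the IP/Router Alert/report header space.  The 32-byte
 * leading-space figure is an assumption for illustration.
 */
#include <stdbool.h>

#define REPORT_MAXRECS	65535		/* IGMP_V3_REPORT_MAXRECS */
#define LEADINGSPACE	32		/* assumed IP + RA option + report header */

static bool
can_merge(unsigned int tail_recs, unsigned int tail_len,
    unsigned int pend_recs, unsigned int pend_len, unsigned int mtu)
{
	return (tail_recs + pend_recs <= REPORT_MAXRECS &&
	    tail_len + pend_len <= mtu - LEADINGSPACE);
}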
This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return 
"ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void igmp_init(void *unused __unused) { CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); } SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL); static void igmp_uninit(void *unused __unused) { CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); } SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL); static void vnet_igmp_init(const void *unused __unused) { CTR1(KTR_IGMPV3, "%s: initializing", __func__); LIST_INIT(&V_igi_head); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { CTR1(KTR_IGMPV3, "%s: tearing down", __func__); KASSERT(LIST_EMPTY(&V_igi_head), ("%s: igi list not empty; ifnets not detached?", __func__)); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_uninit, NULL); static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 286000) +++ head/sys/netinet/in.c (revision 286001) @@ -1,1292 +1,1299 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include +#include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { + struct rm_priotracker in_ifa_tracker; register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { + struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. 
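/*
 * Editor's userland sketch of the test in_localaddr() applies above to
 * each configured address: mask the candidate with the interface's subnet
 * mask and compare against the interface's subnet, all in host byte
 * order.  The example figures (10.1.2.3 against 10.1.2.0/24) are
 * illustrative.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static int
on_subnet(uint32_t addr, uint32_t subnet, uint32_t mask)
{
	return ((addr & mask) == subnet);
}

int
main(void)
{
	uint32_t addr = ntohl(inet_addr("10.1.2.3"));
	uint32_t net  = ntohl(inet_addr("10.1.2.0"));
	uint32_t mask = ntohl(inet_addr("255.255.255.0"));

	printf("local: %d\n", on_subnet(addr, net, mask));	/* prints 1 */
	return (0);
}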
*/ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. 
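For reference, the in_canforward() test earlier in this hunk boils down to a handful of range checks. A self-contained host-byte-order version, with the IN_EXPERIMENTAL/IN_MULTICAST/IN_LINKLOCAL/IN_CLASSA values written out as literals rather than taken from the kernel headers:

/* Sketch: forwarding-eligibility test for an IPv4 destination (host byte order). */
#include <stdint.h>
#include <stdio.h>

static int
can_forward(uint32_t i)
{
        uint32_t net;

        if ((i & 0xf0000000U) == 0xf0000000U)   /* 240/4 experimental */
                return (0);
        if ((i & 0xf0000000U) == 0xe0000000U)   /* 224/4 multicast */
                return (0);
        if ((i & 0xffff0000U) == 0xa9fe0000U)   /* 169.254/16 link-local */
                return (0);
        if ((i & 0x80000000U) == 0) {           /* class A */
                net = i & 0xff000000U;
                if (net == 0 || net == (127U << 24))    /* net 0 and the loopback net */
                        return (0);
        }
        return (1);
}

int
main(void)
{
        printf("%d %d\n", can_forward(0x08080808U), can_forward(0x7f000001U));  /* 1 0 */
        return (0);
}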
* ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. 
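in_aifaddr_ioctl() above falls back to a classful netmask when the caller does not supply one, and derives the broadcast address as subnet | ~mask, with the RFC 3021 /31 case falling back to the limited broadcast. A host-byte-order sketch of just those two derivations; the constants are spelled out here and the helper names are illustrative, not the kernel's:

/* Sketch: classful default mask and directed-broadcast derivation (host byte order). */
#include <stdint.h>
#include <stdio.h>

#define RFC3021_MASK    0xfffffffeU     /* /31: no directed broadcast on the link */

static uint32_t
default_mask(uint32_t addr)
{
        if ((addr & 0x80000000U) == 0)                  /* class A */
                return (0xff000000U);
        if ((addr & 0xc0000000U) == 0x80000000U)        /* class B */
                return (0xffff0000U);
        return (0xffffff00U);                           /* class C and above */
}

static uint32_t
broadcast_addr(uint32_t addr, uint32_t mask)
{
        if (mask == RFC3021_MASK)
                return (0xffffffffU);           /* /31: use the limited broadcast */
        return ((addr & mask) | ~mask);         /* subnet with the host part all-ones */
}

int
main(void)
{
        uint32_t a = 0xc0a80105U;               /* 192.168.1.5 */
        uint32_t m = default_mask(a);

        printf("mask 0x%08x broadcast 0x%08x\n", m, broadcast_addr(a, m));
        return (0);
}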
*/ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); IN_MULTI_LOCK(); if (ii->ii_allhosts) { (void)in_leavegroup_locked(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } IN_MULTI_UNLOCK(); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. 
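in_addprefix(), which follows, answers that question by normalizing every configured address to (addr & mask, mask), treating a host route as its destination address with an empty mask, and by requiring the same FIB before it considers a prefix already routed. A compact sketch of that normalization over plain host-order integers; the struct and field names here are illustrative only:

/* Sketch: "is this the same prefix?" normalization used before adding a prefix route. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct prefix {
        uint32_t        addr;
        uint32_t        mask;   /* 0 models a host route keyed on the peer address */
        int             fib;
};

static bool
same_prefix(const struct prefix *a, const struct prefix *b)
{
        if (a->fib != b->fib || a->mask != b->mask)
                return (false);
        if (a->mask == 0)                               /* host routes compare the full address */
                return (a->addr == b->addr);
        return ((a->addr & a->mask) == (b->addr & b->mask));
}

int
main(void)
{
        struct prefix p1 = { 0x0a000001U, 0xffffff00U, 0 };     /* 10.0.0.1/24 */
        struct prefix p2 = { 0x0a0000feU, 0xffffff00U, 0 };     /* 10.0.0.254/24 */

        printf("%s\n", same_prefix(&p1, &p2) ? "duplicate" : "new");
        return (0);
}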
*/ int in_addprefix(struct in_ifaddr *target, int flags) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; struct sockaddr_in prefix0, mask0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; eia = in_localip_more(target); if (eia != NULL) { int fibnum = target->ia_ifp->if_fib; error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr, fibnum); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } if (!(target->ia_ifp->if_flags & IFF_NOARP)) /* remove arp cache */ arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr); } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? 
RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); return (0); } - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ bzero(&prefix0, sizeof(prefix0)); prefix0.sin_len = sizeof(prefix0); prefix0.sin_family = AF_INET; prefix0.sin_addr.s_addr = target->ia_subnet; bzero(&mask0, sizeof(mask0)); mask0.sin_len = sizeof(mask0); mask0.sin_family = AF_INET; mask0.sin_addr.s_addr = target->ia_subnetmask; lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0, (struct sockaddr *)&mask0, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && t == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. 
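in_broadcast() above accepts the limited broadcast, the configured directed broadcast, and the obsolete host-0 form, while excluding RFC 3021 /31 masks and the all-ones masks used by secondary addresses. A standalone predicate over a single configured (address, mask) pair showing the same checks; it is simplified in that it derives the directed broadcast instead of reading a stored ia_broadaddr and does not model the IFF_BROADCAST capability test:

/* Sketch: broadcast-address test against one configured (address, mask) pair. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RFC3021_MASK    0xfffffffeU

static bool
is_broadcast(uint32_t dst, uint32_t ifaddr, uint32_t mask)
{
        uint32_t subnet = ifaddr & mask;

        if (dst == 0xffffffffU || dst == 0)             /* limited broadcast, old all-zeros form */
                return (true);
        if (mask == 0xffffffffU)                        /* all-ones mask: no directed broadcast */
                return (false);
        if (dst == (subnet | ~mask))                    /* directed broadcast */
                return (true);
        if (mask != RFC3021_MASK && dst == subnet)      /* obsolete host-0 broadcast, never on /31 */
                return (true);
        return (false);
}

int
main(void)
{
        printf("%d\n", is_broadcast(0xc0a801ffU, 0xc0a80105U, 0xffffff00U));    /* 1 */
        return (0);
}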
*/ static void in_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in_multi) purgeinms; struct in_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN_MULTI_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { LIST_REMOVE(inm, inm_link); inm_release_locked(inm); } igmp_ifdetach(ifp); IN_MULTI_UNLOCK(); } struct in_llentry { struct llentry base; struct sockaddr_in l3_addr4; }; /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. */ static void in_lltable_free(struct lltable *llt, struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in_lltable_new(const struct sockaddr *l3addr, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->l3_addr4 = *(const struct sockaddr_in *)l3addr; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_free; LLE_LOCK_INIT(&lle->base); callout_init(&lle->base.la_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) static void in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; struct llentry *lle, *next; int i; size_t pkts_dropped; IF_AFDATA_WLOCK(llt->llt_ifp); for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { /* * (flags & LLE_STATIC) means deleting all entries * including static ARP entries. */ if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)), pfx, msk) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) { LLE_WLOCK(lle); if (callout_stop(&lle->la_timer)) LLE_REMREF(lle); pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } } } IF_AFDATA_WUNLOCK(llt->llt_ifp); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* XXX rtalloc1_fib should take a const param */ rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0, ifp->if_fib); if (rt == NULL) return (EINVAL); /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. 
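The IN_ARE_MASKED_ADDR_EQUAL() macro above is the standard masked-equality idiom: two addresses match under a mask when no bit covered by the mask differs. A tiny demonstration with both operands kept in the same byte order, which is the property the test depends on:

/* Sketch: masked equality, i.e. the addresses agree on every bit the mask covers. */
#include <stdint.h>
#include <stdio.h>

static int
masked_equal(uint32_t d, uint32_t a, uint32_t m)
{
        return (((d ^ a) & m) == 0);
}

int
main(void)
{
        /* 10.1.2.3 vs 10.1.2.200: equal under /24, different under /32. */
        printf("%d %d\n",
            masked_equal(0x0a010203U, 0x0a0102c8U, 0xffffff00U),
            masked_equal(0x0a010203U, 0x0a0102c8U, 0xffffffffU));
        return (0);
}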
*/ if (rt->rt_flags & RTF_GATEWAY) { if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || rt->rt_ifp->if_type != IFT_ETHER || (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { RTFREE_LOCKED(rt); return (EINVAL); } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; mask = (const char *)rt_mask(rt); /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if (mask == NULL) { RTFREE_LOCKED(rt); return (EINVAL); } sa = (const char *)rt_key(rt); addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif RTFREE_LOCKED(rt); return (EINVAL); } } } RTFREE_LOCKED(rt); return (0); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; struct llentries *lleh; u_int hashkey; IF_AFDATA_LOCK_ASSERT(ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); hashkey = sin->sin_addr.s_addr; lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; LIST_FOREACH(lle, lleh, lle_next) { struct sockaddr_in *sa2 = satosin(L3_ADDR(lle)); if (lle->la_flags & LLE_DELETED) continue; if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr) break; } if (lle == NULL) { #ifdef DIAGNOSTIC if (flags & LLE_DELETE) log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); #endif if (!(flags & LLE_CREATE)) return (NULL); IF_AFDATA_WLOCK_ASSERT(ifp); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. 
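The byte-wise loop at the end of in_lltable_rtcheck() verifies that the route's key agrees with the target address wherever the route's mask has bits set, so ARP resolution is only attempted for addresses the route actually covers. The same loop lifted into a standalone function over raw byte arrays:

/* Sketch: does (key, mask) cover addr?  Byte-wise form of the rtcheck loop. */
#include <stddef.h>
#include <stdio.h>

static int
route_covers(const unsigned char *key, const unsigned char *mask,
    const unsigned char *addr, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                if ((key[i] ^ addr[i]) & mask[i])
                        return (0);     /* a masked bit differs: not covered */
        return (1);
}

int
main(void)
{
        unsigned char key[4]  = { 192, 168, 1, 0 };
        unsigned char mask[4] = { 255, 255, 255, 0 };
        unsigned char addr[4] = { 192, 168, 1, 77 };

        printf("%d\n", route_covers(key, mask, addr, 4));       /* 1 */
        return (0);
}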
*/ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) goto done; lle = in_lltable_new(l3addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); goto done; } lle->la_flags = flags & ~LLE_CREATE; if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); lle->la_flags |= (LLE_VALID | LLE_STATIC); } lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } else if (flags & LLE_DELETE) { if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) llentry_free(lle); else LLE_WUNLOCK(lle); } lle = (void *)-1; } if (LLE_IS_VALID(lle)) { if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); } done: return (lle); } static int in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) { #define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle)) struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; int error, i; LLTABLE_LOCK_ASSERT(); error = 0; for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { struct sockaddr_dl *sdl; /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) continue; /* Skip if jailed and not a valid IP of the prison. */ if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) continue; /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ bzero(&arpc, sizeof(arpc)); arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; arpc.sin.sin_family = AF_INET; arpc.sin.sin_len = sizeof(arpc.sin); arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 
0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); if (error) break; } } return error; #undef SIN } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; struct lltable *llt; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); llt = lltable_init(ifp, AF_INET); if (llt != NULL) { llt->llt_prefix_free = in_lltable_prefix_free; llt->llt_lookup = in_lltable_lookup; llt->llt_dump = in_lltable_dump; } ii->ii_llt = llt; ii->ii_igmp = igmp_domifattach(ifp); return ii; } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 286000) +++ head/sys/netinet/in_mcast.c (revision 286001) @@ -1,3008 +1,3012 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). 
*/ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static void inp_freemoptions_internal(struct ip_moptions *); static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); static STAILQ_HEAD(, ip_moptions) imo_gc_list = STAILQ_HEAD_INITIALIZER(imo_gc_list); static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. 
*/ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET) { inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; IN_MULTI_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. 
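imo_grow() above resizes the membership and filter vectors in lock-step, growing to the next power of two minus one, and keeps the old arrays whenever allocation fails. The growth rule and the keep-old-on-failure behaviour reduced to a single array; the helper name and the limit are made up for the example:

/* Sketch: grow a vector to the next power of two minus one, keeping it on failure. */
#include <stdio.h>
#include <stdlib.h>

static int
grow(int **vecp, size_t *maxp, size_t limit)
{
        size_t newmax = ((*maxp + 1) * 2) - 1;  /* 0 -> 1 -> 3 -> 7 -> 15 ... */
        int *nvec;

        if (newmax > limit)
                return (-1);
        nvec = realloc(*vecp, newmax * sizeof(int));
        if (nvec == NULL)
                return (-1);                    /* *vecp is still valid and untouched */
        *vecp = nvec;
        *maxp = newmax;
        return (0);
}

int
main(void)
{
        int *v = NULL;
        size_t max = 0;

        while (grow(&v, &max, 4095) == 0)
                ;
        printf("final max = %zu\n", max);       /* 4095 */
        free(v);
        return (0);
}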
*/ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. 
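The acceptance test in imo_multi_filter() above comes down to two cases: with no matching source entry, only exclude-mode memberships accept the datagram; with a matching entry, it is accepted only when the entry's mode agrees with the membership's current filter mode. As a standalone decision function (simplified; the kernel compares the socket-layer state at t0 against the membership mode at t1):

/* Sketch: socket-layer source-filter decision under include/exclude semantics. */
#include <stdio.h>

enum fmode { MODE_INCLUDE, MODE_EXCLUDE };

/* have_entry: a filter entry exists for this source; entry_mode: that entry's mode. */
static int
source_passes(enum fmode membership_mode, int have_entry, enum fmode entry_mode)
{
        if (!have_entry)
                return (membership_mode == MODE_EXCLUDE);       /* default-accept only in exclude mode */
        return (entry_mode == membership_mode);                 /* the entry must be "in mode" */
}

int
main(void)
{
        printf("%d %d\n",
            source_passes(MODE_EXCLUDE, 0, MODE_INCLUDE),       /* 1: no entry, exclude mode */
            source_passes(MODE_INCLUDE, 0, MODE_INCLUDE));      /* 0: no entry, include mode */
        return (0);
}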
*/ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); ++inm->inm_refcount; *pinm = inm; return (0); } memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa(*group)); #endif ++inm->inm_refcount; *pinm = inm; IF_ADDR_WUNLOCK(ifp); return (0); } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { if_delmulti_ifma(ifma); IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; IF_ADDR_WUNLOCK(ifp); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ void inm_release_locked(struct in_multi *inm) { struct ifmultiaddr *ifma; IN_MULTI_LOCK_ASSERT(); CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); if (--inm->inm_refcount > 0) { CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, inm->inm_refcount); return; } CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == inm, ("%s: ifma_protospec != inm", __func__)); ifma->ifma_protospec = NULL; inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma(ifma); } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. 
* * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occured (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. 
*/ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. 
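The imf_* routines above implement a small two-slot transaction: every filter and every source carries its state at t0 (committed) and t1 (pending); rollback copies t0 over t1 and frees sources that only ever existed at t1, while commit copies t1 over t0. The shape of that pattern on a single record, with illustrative names:

/* Sketch: two-slot state, t0 committed and t1 pending, with commit and rollback. */
#include <stdio.h>

enum st { ST_UNDEFINED, ST_INCLUDE, ST_EXCLUDE };

struct filter {
        enum st st[2];          /* st[0] = t0 (committed), st[1] = t1 (pending) */
};

static void
filter_stage(struct filter *f, enum st s)
{
        f->st[1] = s;           /* stage a change; t0 is untouched */
}

static void
filter_rollback(struct filter *f)
{
        f->st[1] = f->st[0];    /* drop the pending change */
}

static void
filter_commit(struct filter *f)
{
        f->st[0] = f->st[1];    /* the pending state becomes the committed state */
}

int
main(void)
{
        struct filter f = { { ST_EXCLUDE, ST_EXCLUDE } };

        filter_stage(&f, ST_INCLUDE);
        filter_rollback(&f);
        printf("after rollback: %d\n", f.st[1]);        /* 2, i.e. ST_EXCLUDE */
        filter_stage(&f, ST_INCLUDE);
        filter_commit(&f);
        printf("after commit: %d\n", f.st[0]);          /* 1, i.e. ST_INCLUDE */
        return (0);
}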
* * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; #ifdef KTR struct in_addr ia; #endif find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR ia.s_addr = htonl(haddr); CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__, inet_ntoa(ia), ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR struct in_addr ia; ia.s_addr = htonl(ims->ims_haddr); #endif if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s", __func__, n, inet_ntoa(ia)); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s", __func__, n, inet_ntoa(ia)); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s", __func__, n, inet_ntoa(ia)); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s", __func__, n, inet_ntoa(ia)); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. 
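ims_merge() above folds one socket-layer source's t0/t1 modes into the shared per-source include/exclude counters, and the rollback argument flips the sign so the identical code can undo a previous merge. The same bookkeeping reduced to the counters alone:

/* Sketch: merge a per-socket source state into shared include/exclude counters. */
#include <stdio.h>

enum st { ST_UNDEFINED, ST_INCLUDE, ST_EXCLUDE };

struct counters { int in, ex; };

static void
merge(struct counters *c, enum st t0, enum st t1, int rollback)
{
        int n = rollback ? -1 : 1;

        if (t0 == ST_EXCLUDE)           /* withdraw the old contribution */
                c->ex -= n;
        else if (t0 == ST_INCLUDE)
                c->in -= n;
        if (t1 == ST_EXCLUDE)           /* add the new contribution */
                c->ex += n;
        else if (t1 == ST_INCLUDE)
                c->in += n;
}

int
main(void)
{
        struct counters c = { 0, 0 };

        merge(&c, ST_UNDEFINED, ST_EXCLUDE, 0); /* a new exclude-mode source */
        merge(&c, ST_UNDEFINED, ST_EXCLUDE, 1); /* roll it back */
        printf("in=%d ex=%d\n", c.in, c.ex);    /* 0 0 */
        return (0);
}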
*/ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. 
*/ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__, inet_ntoa(*gina), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_locked(inm); } else { *pinm = inm; } return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. 
*/ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; error = 0; IN_MULTI_LOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, inet_ntoa(inm->inm_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_locked(inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. * This KPI is for legacy kernel consumers only. */ void in_delmulti(struct in_multi *inm) { (void)in_leavegroup(inm, NULL); } /*#endif*/ /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. 
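 *
 * From userland the delta-based request that lands here is built with
 * struct ip_mreq_source or struct group_source_req; a minimal sketch of
 * the former (group and source addresses are placeholders):
 *
 *	struct ip_mreq_source mreqs;
 *
 *	mreqs.imr_multiaddr.s_addr  = inet_addr("239.1.1.1");
 *	mreqs.imr_sourceaddr.s_addr = inet_addr("192.0.2.1");
 *	mreqs.imr_interface.s_addr  = INADDR_ANY;
 *	setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE, &mreqs, sizeof(mreqs));
 *
 * This only succeeds on a membership that is in exclusive mode on the
 * socket; on an include-mode membership it fails with EINVAL, as checked
 * below.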
*/ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", __func__, inet_ntoa(mreqs.imr_interface), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. 
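 *
 * Note on ordering: the INP write lock taken by inp_findmoptions() is
 * still held here; IN_MULTI_LOCK is acquired inside it, and the merge,
 * the IGMP downcall and the commit or rollback of imf all complete
 * before the INP lock is dropped at the end of the function.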
*/ IN_MULTI_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is performed asynchronously in a separate task. * * SMPng: NOTE: assumes INP write lock is held. */ void inp_freemoptions(struct ip_moptions *imo) { KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); IN_MULTI_LOCK(); STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); IN_MULTI_UNLOCK(); taskqueue_enqueue(taskqueue_thread, &imo_gc_task); } static void inp_freemoptions_internal(struct ip_moptions *imo) { struct in_mfilter *imf; size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); (void)in_leavegroup(imo->imo_membership[idx], imf); if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } static void inp_gcmoptions(void *context, int pending) { struct ip_moptions *imo; IN_MULTI_LOCK(); while (!STAILQ_EMPTY(&imo_gc_list)) { imo = STAILQ_FIRST(&imo_gc_list); STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); IN_MULTI_UNLOCK(); inp_freemoptions_internal(imo); IN_MULTI_LOCK(); } IN_MULTI_UNLOCK(); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. 
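 *
 * The usual caller is the IP_MSFILTER getsockopt() path (which the
 * getsourcefilter(3) wrapper typically uses).  A sketch of the raw form,
 * first issued with msfr_nsrcs == 0 to learn the required buffer size
 * (interface name and group address are placeholders):
 *
 *	struct __msfilterreq msfr;
 *	struct sockaddr_in *gsin = (struct sockaddr_in *)&msfr.msfr_group;
 *	socklen_t len = sizeof(msfr);
 *
 *	memset(&msfr, 0, sizeof(msfr));
 *	msfr.msfr_ifindex = if_nametoindex("em0");
 *	gsin->sin_family = AF_INET;
 *	gsin->sin_len = sizeof(struct sockaddr_in);
 *	gsin->sin_addr.s_addr = inet_addr("239.1.1.1");
 *	getsockopt(s, IPPROTO_IP, IP_MSFILTER, &msfr, &len);
 *	msfr.msfr_fmode and msfr.msfr_nsrcs now describe the filter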
*/ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { + struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. 
*/ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); } } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { + struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { struct route ro; ro.ro_rt = NULL; memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); in_rtalloc_ign(&ro, 0, inp ? 
inp->inp_inc.inc_fibnum : 0); if (ro.ro_rt != NULL) { ifp = ro.ro_rt->rt_ifp; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); RTFREE(ro.ro_rt); } else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", __func__, inet_ntoa(mreqs.imr_interface), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. 
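 *
 * A request reaching this case is typically built as below (interface
 * name and group address are placeholders); any port the caller leaves
 * in gr_group is discarded here so it cannot defeat the lookup:
 *
 *	struct group_req gr;
 *	struct sockaddr_in *gsin = (struct sockaddr_in *)&gr.gr_group;
 *
 *	memset(&gr, 0, sizeof(gr));
 *	gr.gr_interface = if_nametoindex("em0");	must be non-zero
 *	gsin->sin_family = AF_INET;
 *	gsin->sin_len = sizeof(struct sockaddr_in);
 *	gsin->sin_addr.s_addr = inet_addr("239.1.1.1");
 *	setsockopt(s, IPPROTO_IP, MCAST_JOIN_GROUP, &gr, sizeof(gr));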
*/ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. 
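 *
 * The request for this branch is a struct group_source_req, shaped like
 * the MCAST_JOIN_GROUP request above but with the source filled in as
 * well (illustrative):
 *
 *	struct group_source_req gsr;	gsr_interface and gsr_group set up
 *					as for MCAST_JOIN_GROUP
 *	struct sockaddr_in *ssin = (struct sockaddr_in *)&gsr.gsr_source;
 *
 *	ssin->sin_family = AF_INET;
 *	ssin->sin_len = sizeof(struct sockaddr_in);
 *	ssin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP, &gsr, sizeof(gsr));
 *
 * Repeating this for further sources grows an include-mode filter; it is
 * rejected on a membership already in exclusive mode.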
*/ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); IN_MULTI_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); goto out_in_multi_locked; } } out_in_multi_locked: IN_MULTI_UNLOCK(); INP_WLOCK_ASSERT(inp); if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. 
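 *
 * The race alluded to here is inherent in the legacy request format,
 * which names the interface by one of its IPv4 addresses rather than by
 * index, e.g. (addresses are placeholders):
 *
 *	struct ip_mreq mreq;
 *
 *	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
 *	mreq.imr_interface.s_addr = inet_addr("192.0.2.10");
 *	setsockopt(s, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * The address-to-ifp mapping can change between the join and the leave,
 * whereas MCAST_LEAVE_GROUP carries a stable interface index.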
*/ if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", __func__, inet_ntoa(mreqs.imr_interface), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, inet_ntoa(ssa->sin.sin_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } out_in_multi_locked: IN_MULTI_UNLOCK(); if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; } imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. 
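 *
 * Both accepted encodings, sketched (interface name and address are
 * placeholders):
 *
 *	struct ip_mreqn mreqn;			index-based form
 *
 *	memset(&mreqn, 0, sizeof(mreqn));
 *	mreqn.imr_ifindex = if_nametoindex("em0");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &mreqn, sizeof(mreqn));
 *
 *	struct in_addr ina;			traditional address form
 *
 *	ina.s_addr = inet_addr("192.0.2.10");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ina, sizeof(ina));
 *
 * The two are told apart purely by option length, as done below.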
*/ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp, inet_ntoa(addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. 
* Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. 
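 *
 * Either form is accepted, e.g. (illustrative):
 *
 *	u_char ttl = 32;
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
 *
 *	int ittl = 32;			int form; values above 255 are EINVAL
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ittl, sizeof(ittl));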
*/ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group %s is not multicast", __func__, inet_ntoa(group)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); IN_MULTI_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { #ifdef KTR struct in_addr ina; ina.s_addr = htonl(ims->ims_haddr); CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa(ina)); #endif /* * Only copy-out sources which are in-mode. 
*/ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); IN_MULTI_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 286000) +++ head/sys/netinet/in_pcb.c (revision 286001) @@ -1,2618 +1,2621 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. 
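 * In short (see also in_pcb_lport() below): UDP and UDP-Lite ports are
 * picked at random whenever net.inet.ip.portrange.randomized is set; for
 * TCP, roughly, a burst of more than net.inet.ip.portrange.randomcps
 * allocations makes ipport_tick() fall back to sequential allocation for
 * net.inet.ip.portrange.randomtime seconds before randomizing again.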
*/ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. 
*/ /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local 
port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. 
*/ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. 
*/ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. 
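 *
 * Illustrative sketch (editorial addition): in_pcbconnect_setup() only
 * falls back on this selection when no local address is bound yet, e.g.:
 *
 *	if (laddr.s_addr == INADDR_ANY)
 *		error = in_pcbladdr(inp, &faddr, &laddr, cred);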
*/ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. 
* In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { + struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. 
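 *
 * Note (editorial addition reflecting the rwlock-to-rmlock conversion in
 * this revision): the read-side accesses to V_in_ifaddrhead below now
 * pass a caller-supplied rm_priotracker to the lock macros, e.g.:
 *
 *	struct rm_priotracker in_ifa_tracker;
 *
 *	IN_IFADDR_RLOCK(&in_ifa_tracker);
 *	faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 *	IN_IFADDR_RUNLOCK(&in_ifa_tracker);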
*/ if (faddr.s_addr == INADDR_ANY) { - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). 
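 *
 * Illustrative sketch (editorial addition) of the canonical pattern:
 *
 *	in_pcbref(inp);
 *	INP_WUNLOCK(inp);
 *	...	acquire or reorder other locks as needed	...
 *	INP_WLOCK(inp);
 *	if (in_pcbrele_wlocked(inp))
 *		return;	(the inpcb was freed elsewhere, do not touch it)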
*/ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) return (0); KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ #ifdef IPSEC if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); if (inp->in6p_moptions != NULL) ip6_freemoptions(inp->in6p_moptions); } #endif if (inp->inp_options) (void)m_free(inp->inp_options); #ifdef INET if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. 
This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. 
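 *
 * Illustrative sketch (editorial addition) of the calling contract: the
 * caller holds the pcbinfo hash lock and the result, if any, comes back
 * neither locked nor referenced:
 *
 *	INP_HASH_WLOCK(pcbinfo);
 *	t = in_pcblookup_local(pcbinfo, laddr, lport,
 *	    INPLOOKUP_WILDCARD, cred);
 *	if (t != NULL)
 *		...	t is valid only while the hash lock is held	...
 *	INP_HASH_WUNLOCK(pcbinfo);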
*/ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. 
*/ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. 
*/ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. 
*/ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. 
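 *
 * Illustrative caller sketch (editorial addition): a lookup must request
 * either a read or a write lock on the returned PCB and release it when
 * done, e.g.:
 *
 *	inp = in_pcblookup(pcbinfo, faddr, fport, laddr, lport,
 *	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp);
 *	if (inp != NULL) {
 *		...	use the read-locked inpcb	...
 *		INP_RUNLOCK(inp);
 *	}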
*/ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. 
in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. 
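 * The callout is armed here for the first time and thereafter rearms
 * itself once per second (every hz ticks) from ipport_tick().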
*/ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? 
", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ Index: head/sys/netinet/in_var.h =================================================================== --- head/sys/netinet/in_var.h (revision 286000) +++ head/sys/netinet/in_var.h (revision 286001) @@ -1,400 +1,401 @@ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. 
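 *
 * Usage note (editorial addition): with in_ifaddr_lock converted to an
 * rmlock in this revision, the read-side macros declared below take a
 * caller-supplied struct rm_priotracker, e.g. for IFP_TO_IA():
 *
 *	struct rm_priotracker in_ifa_tracker;
 *	struct in_ifaddr *ia;
 *
 *	IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 *	if (ia != NULL) {
 *		...	use ia	...
 *		ifa_free(&ia->ia_ifa);
 *	}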
*/ TAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) -extern struct rwlock in_ifaddr_lock; +extern struct rmlock in_ifaddr_lock; -#define IN_IFADDR_LOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_LOCKED) -#define IN_IFADDR_RLOCK() rw_rlock(&in_ifaddr_lock) -#define IN_IFADDR_RLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_RLOCKED) -#define IN_IFADDR_RUNLOCK() rw_runlock(&in_ifaddr_lock) -#define IN_IFADDR_WLOCK() rw_wlock(&in_ifaddr_lock) -#define IN_IFADDR_WLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_WLOCKED) -#define IN_IFADDR_WUNLOCK() rw_wunlock(&in_ifaddr_lock) +#define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) +#define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) +#define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) +#define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) +#define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) +#define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) +#define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ -#define IFP_TO_IA(ifp, ia) \ +#define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ + /* struct rm_priotracker *t; */ \ do { \ - IN_IFADDR_RLOCK(); \ + IN_IFADDR_RLOCK((t)); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ - IN_IFADDR_RUNLOCK(); \ + IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. 
*/ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. */ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ }; /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). 
*/ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_mtx; #define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) #define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) #define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) #define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); ++inm->inm_refcount; } /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct radix_node_head; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); void inm_release(struct in_multi *); void inm_release_locked(struct in_multi *); struct in_multi * in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_fastforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtalloc(struct route *ro, u_int fibnum); struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 286000) +++ head/sys/netinet/ip_icmp.c (revision 
286001) @@ -1,966 +1,969 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. 
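 *
 * As an illustration of the error-generation entry point, a forwarding-path
 * caller such as ip_forward() reports an expired TTL with:
 *
 *	icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 *
 * while inbound ICMP is dispatched through icmp_input() below.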
*/ static VNET_DEFINE(int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); static VNET_DEFINE(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets."); static VNET_DEFINE(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets."); VNET_DEFINE(int, drop_redirect) = 0; #define V_drop_redirect VNET(drop_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); static VNET_DEFINE(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "icmp reply source for non-local packets."); static VNET_DEFINE(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); static VNET_DEFINE(int, icmp_quotelen) = 8; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); /* * ICMP broadcast echo sysctl */ static VNET_DEFINE(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, ""); static VNET_DEFINE(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. 
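 *
 * Modules normally reach this through the KMOD_ICMPSTAT_INC() macro from
 * icmp_var.h rather than calling it directly; the expansion is roughly
 * (illustrative only, the header is authoritative):
 *
 *	KMOD_ICMPSTAT_INC(icps_error);
 *	    == kmod_icmpstat_inc(offsetof(struct icmpstat, icps_error) /
 *	        sizeof(uint64_t));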
*/ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen, icmpelen, nlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* Drop if IP header plus 8 bytes is not contignous in first mbuf. */ if (oiphlen + 8 > n->m_len) goto freeit; /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunatly this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)) goto freeit; th = (struct tcphdr *)((caddr_t)oip + oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) goto freeit; icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, ICMP_MINLEN + icmplen); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); ICMPSTAT_INC(icps_outhist[type]); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. 
*/ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; *mp = NULL; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_src)); printf("icmp_input from %s to %s, len %d\n", buf, inet_ntoa(ip->ip_dst), icmplen); } #endif if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; ICMPSTAT_INC(icps_inhist[icp->icmp_type]); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; deliver: /* * Problem with datagram; advise higher level routines. 
*/ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; if (ia->ia_ifp == NULL) { ifa_free(&ia->ia_ifa); break; } icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } ifa_free(&ia->ia_ifa); reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). 
*/ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); printf("redirect dst %s to %s\n", buf, inet_ntoa(icp->icmp_gwaddr)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_SOURCEQUENCH: default: break; } raw: *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct mbuf *opts = 0; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. 
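 *
 * For example, replies for transit traffic can be pinned to one interface
 * (interface name purely illustrative) by setting:
 *
 *	net.inet.icmp.reply_src=igb0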
*/ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); if (ia == NULL) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == 0 && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); printf("icmp_send dst %s src %s\n", buf, inet_ntoa(ip->ip_src)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. 
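 *
 * Worked example against the plateau table below: ip_next_mtu(1500, 0)
 * returns 1492 (the next smaller plateau), while ip_next_mtu(1500, -1)
 * returns 2002 (the next larger plateau).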
*/ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ int badport_bandlim(int which) { #define N(a) (sizeof (a) / sizeof (a[0])) static struct rate { const char *type; struct timeval lasttime; int curpps; } rates[BANDLIM_MAX+1] = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" }, { "sctp ootb response" } }; /* * Return ok status if feature disabled or argument out of range. */ if (V_icmplim > 0 && (u_int) which < N(rates)) { struct rate *r = &rates[which]; int opps = r->curpps; if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim)) return -1; /* discard packet */ /* * If we've dropped below the threshold after having * rate-limited traffic print the message. This preserves * the previous behaviour at the expense of added complexity. */ if (V_icmplim_output && opps > V_icmplim) log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n", r->type, opps, V_icmplim); } return 0; /* okay to send packet */ #undef N } Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 286000) +++ head/sys/netinet/ip_input.c (revision 286001) @@ -1,1334 +1,1335 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif -struct rwlock in_ifaddr_lock; -RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); +struct rmlock in_ifaddr_lock; +RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. 
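 *
 * One possible shape for such a check (a sketch only, not part of this
 * change) would be an assertion in ip_direct_input() that a hash type was
 * recorded on the mbuf:
 *
 *	KASSERT(M_HASHTYPE_GET(m) != M_HASHTYPE_NONE,
 *	    ("%s: mbuf without flow details", __func__));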
*/ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input 
queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE void ip_destroy(void) { int i; if ((i = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, i); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); /* Destroy IP reassembly queue. */ ipreass_destroy(); } #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. 
*/ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); if (dchg != 0) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. 
*/ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. 
*/ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m, ip->ip_p) != 0) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } /* * Given address of next destination (final or next hop), return (referenced) * internet address info of interface to be used to get there. 
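 *
 * The returned in_ifaddr is referenced, so callers must drop the reference
 * when finished; icmp_reflect() above is a representative user:
 *
 *	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
 *	if (ia != NULL) {
 *		t = IA_SIN(ia)->sin_addr;
 *		ifa_free(&ia->ia_ifa);
 *	}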
*/ struct in_ifaddr * ip_rtaddr(struct in_addr dst, u_int fibnum) { struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ia; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; in_rtalloc_ign(&sro, 0, fibnum); if (sro.ro_rt == NULL) return (NULL); ia = ifatoia(sro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); RTFREE(sro.ro_rt); return (ia); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #ifdef IPSEC if (ip_ipsec_fwd(m) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #endif /* IPSEC */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); } else ia = NULL; #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. * In case of IPsec, Don't discard it just yet, but pass it to * ip_output in case of outgoing IPsec policy. */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); RO_RTFREE(&ro); return; } #endif /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. 
*/ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. 
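 *
 * For instance (numbers illustrative), if ip_output() returned EMSGSIZE for
 * a 1500-byte packet, the route reported no MTU and no interface address is
 * available, the advertised next-hop MTU falls back to
 * ip_next_mtu(1500, 0), i.e. 1492.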
*/ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? 
*/ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 286000) +++ head/sys/netinet/ip_output.c (revision 286001) @@ -1,1369 +1,1372 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. 
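 *
 * Returns 0 on success, otherwise an errno value: for example
 * EHOSTUNREACH/ENETUNREACH when no usable route or interface is
 * found, EMSGSIZE when a too-large packet cannot be fragmented, and
 * EACCES for a broadcast sent without IP_ALLOWBROADCAST.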
*/ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { + struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ struct in_addr odst; struct m_tag *fwd_tag = NULL; uint32_t fibnum; int have_ia_ref; int needfiblookup; #ifdef IPSEC int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: ia = NULL; have_ia_ref = 0; /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. * The address family should also be checked in case of sharing * the cache with IPv6. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RO_RTFREE(ro); ro->ro_lle = NULL; rte = NULL; gw = dst; } if (rte == NULL && fwd_tag == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. 
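 *
 * imo_multicast_ifp is normally set by an IP_MULTICAST_IF setsockopt
 * on the sending socket.  IFP_TO_IA() now also takes the
 * in_ifa_tracker declared at the top of this function; struct
 * rm_priotracker is the per-reader state used with read-mostly (rm)
 * locks, which suggests the ifaddr lookup inside the macro runs
 * under such a lock.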
*/ ifp = imo->imo_multicast_ifp; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia) have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(gw->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) mtu = rte->rt_mtu; else mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, dst, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. 
*/ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &error)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IP, m); goto done; } else { if (have_ia_ref) ifa_free(&ia->ia_ifa); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? 
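 *
 * A packet filter (ipfw "fwd", for instance) can attach a
 * PACKET_TAG_IPFORWARD tag whose payload is a struct sockaddr_in
 * naming the new next hop; the payload sits immediately after the
 * m_tag header, hence the bcopy() from (fwd_tag + 1).  Once copied
 * into dst the tag is deleted and the route lookup is redone via
 * "goto again".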
*/ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); if (have_ia_ref) ifa_free(&ia->ia_ifa); goto again; } passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. 
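 *
 * Worked example: for an mtu of 1500 and a 4000-byte datagram with a
 * 20-byte header, len = (1500 - 20) & ~7 = 1480, so the chain carries
 * 1480 + 1480 + 1020 bytes of payload at fragment offsets 0, 185 and
 * 370 (in 8-byte units), with IP_MF set on every fragment except the
 * last.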
* * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? 
*/ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? 
M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. 
*/ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. 
*/ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif if_simloop(ifp, copym, dst->sin_family, 0); } } Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 286000) +++ head/sys/netinet/raw_ip.c (revision 286001) @@ -1,1129 +1,1131 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. 
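 *
 * Each of these is a hook that the optional multicast routing / RSVP
 * code fills in when it is loaded; since they live in BSS they start
 * out NULL, so callers must test the pointer before use, as
 * rsvp_input() does with rsvp_input_p.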
*/ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE void rip_destroy(void) { in_pcbinfo_destroy(&V_ripcbinfo); } #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
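 *
 * Delivery is done in two passes over the raw PCB hash: first the
 * bucket chosen by INP_PCBHASH_RAW() for fully specified (protocol,
 * local, foreign) sockets, then bucket 0, where rip_inshash() places
 * every PCB that has a wildcard in any of those three fields.  Every
 * match except the last receives an m_copy() of the datagram; the
 * original mbuf is handed to the final matching PCB.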
*/ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. */ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { m_freem(m); IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have setup with control call. 
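 *
 * With INP_HDRINCL clear the kernel prepends a struct ip and fills it
 * from the inpcb (TOS, TTL, protocol, addresses); with it set, the
 * caller-supplied header is only sanity-checked (length fields, jail
 * source address) and IP_RAWOUTPUT is passed so that ip_output()
 * leaves the header fields alone.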
*/ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) > m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? 
*/ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. 
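 *
 * Note the locking pattern below: the matching address is found under
 * IN_IFADDR_RLOCK(), pinned with ifa_ref(), and the read lock is
 * dropped before in_scrubprefix()/in_ifadown() (or rtinit() in the
 * PRC_IFUP case) run; ifa_free() releases the reference afterwards.
 * The in_ifa_tracker declared below is the per-reader state that
 * IN_IFADDR_RLOCK()/IN_IFADDR_RUNLOCK() now expect.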
*/ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. */ - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; 
if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. 
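 *
 * The handler follows the usual two-pass pcblist protocol: when
 * req->oldptr is NULL only a size estimate (current count plus n/8,
 * at least 10, spare slots) is returned so userland can size its
 * buffer; otherwise the generation count is snapshotted, matching
 * PCBs are collected under the INP_INFO read lock with in_pcbref()
 * references, copied out after that lock is dropped, and the trailing
 * xinpgen reports the current generation so the consumer can detect
 * concurrent changes and retry.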
*/ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 286000) +++ head/sys/netinet6/in6.c (revision 286001) @@ -1,2427 +1,2433 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. 
*/ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; /* Announce arrival of local address to all FIBs. */ rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. 
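/*
 * Editor's illustrative sketch (not part of this revision): the conversion
 * in6_mask2len() performs above, restated for a plain 16-byte mask with no
 * scope-id handling.  Returns -1 for a non-contiguous mask, like the
 * stricter check in the original.
 */
#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

static int
mask_to_plen(const struct in6_addr *mask)
{
	int plen = 0, i;

	for (i = 0; i < 16 && mask->s6_addr[i] == 0xff; i++)
		plen += 8;			/* whole 0xff bytes */
	if (i < 16) {
		u_char b = mask->s6_addr[i];

		for (; b & 0x80; b <<= 1)	/* leading ones of the partial byte */
			plen++;
		if (b != 0)
			return (-1);		/* set bits below the boundary */
		for (i++; i < 16; i++)
			if (mask->s6_addr[i] != 0)
				return (-1);	/* set bits after the partial byte */
	}
	return (plen);
}

int
main(void)
{
	struct in6_addr m;

	inet_pton(AF_INET6, "ffff:ffff:ffff:ff80::", &m);
	printf("plen = %d\n", mask_to_plen(&m));	/* prints 57 */
	return (0);
}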
* We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGDRLST_IN6: case SIOCGPRLST_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. 
*/ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; case SIOCSIFALIFETIME_IN6: { struct in6_addrlifetime *lt; if (td != NULL) { error = priv_check(td, PRIV_NETINET_ALIFETIME6); if (error) goto out; } if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* sanity for overflow - beware unsigned */ lt = &ifr->ifr_ifru.ifru_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME && lt->ia6t_vltime + time_uptime < time_uptime) { error = EINVAL; goto out; } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME && lt->ia6t_pltime + time_uptime < time_uptime) { error = EINVAL; goto out; } break; } } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? 
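/*
 * Editor's illustrative sketch (not part of this revision): the
 * "beware unsigned" test used above for SIOCSIFALIFETIME_IN6
 * (lt->ia6t_vltime + time_uptime < time_uptime), reduced to the bare
 * idiom: for unsigned operands of one width, a + b wraps around exactly
 * when the sum compares less than either operand.
 */
#include <stdint.h>
#include <stdio.h>

static int
add_would_wrap(uint32_t a, uint32_t b)
{
	return ((uint32_t)(a + b) < a);		/* true only on wraparound */
}

int
main(void)
{
	printf("%d\n", add_would_wrap(4294967290U, 10));	/* 1: wraps */
	printf("%d\n", add_would_wrap(1000U, 10));		/* 0: fits */
	return (0);
}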
*/ ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCSIFALIFETIME_IN6: ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime; /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) ifa_free(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. 
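/*
 * Editor's illustrative sketch (not part of this revision): the clamp the
 * SIOCGIFALIFETIME_IN6 branch above applies before reporting ia6t_expire.
 * maxexpire is the largest positive time_t (all bits set except the sign
 * bit), so updatetime + vltime is only formed when it cannot overflow.
 * This mirrors the kernel expression and, like it, assumes two's
 * complement.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>

static time_t
clamped_expire(time_t updatetime, uint32_t vltime)
{
	time_t maxexpire;

	maxexpire = (time_t)-1 & ~((time_t)1 << (sizeof(maxexpire) * 8 - 1));
	if ((time_t)vltime < maxexpire - updatetime)
		return (updatetime + vltime);
	return (maxexpire);
}

int
main(void)
{
	printf("%jd\n", (intmax_t)clamped_expire(1000, 4294967295U));
	/* prints 4294968295 on a system with 64-bit time_t */
	return (0);
}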
* The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); goto out; } if (pr == NULL) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_refcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: if (error != 0 || ia == NULL) break; /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_refcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr && pr->ndpr_refcnt == 0) prelist_remove(pr); EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. 
*/ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). 
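/*
 * Editor's illustrative sketch (not part of this revision): how the
 * solicited-node group joined at the top of in6_update_ifa_join_mc() above
 * is derived from a unicast address (RFC 4291: ff02::1:ffXX:XXXX, keeping
 * the low 24 bits).  The kernel additionally embeds the link's zone with
 * in6_setscope(); that step is omitted here.
 */
#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void
solicited_node(const struct in6_addr *ucast, struct in6_addr *grp)
{
	memset(grp, 0, sizeof(*grp));
	grp->s6_addr[0] = 0xff;		/* ff02:: (link-local scope) */
	grp->s6_addr[1] = 0x02;
	grp->s6_addr[11] = 0x01;	/* word 2 = htonl(1) */
	grp->s6_addr[12] = 0xff;	/* word 3 = low bits, byte 12 forced to ff */
	memcpy(&grp->s6_addr[13], &ucast->s6_addr[13], 3);
}

int
main(void)
{
	struct in6_addr u, g;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "2001:db8::abcd:1234", &u);
	solicited_node(&u, &g);
	printf("%s\n", inet_ntop(AF_INET6, &g, buf, sizeof(buf)));
	/* prints ff02::1:ffcd:1234 */
	return (0);
}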
*/ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. 
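/*
 * Editor's illustrative sketch (not part of this revision): roughly how a
 * userland tool such as ifconfig(8) reaches in6_update_ifa() above, by
 * filling a struct in6_aliasreq and issuing SIOCAIFADDR_IN6 through
 * in6_control().  The interface name and address are placeholders, error
 * handling is omitted, and the exact header prerequisites may differ
 * between releases.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>	/* struct in6_aliasreq, SIOCAIFADDR_IN6 */
#include <netinet6/nd6.h>	/* ND6_INFINITE_LIFETIME */
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct in6_aliasreq ifra;
	int s;

	memset(&ifra, 0, sizeof(ifra));
	strlcpy(ifra.ifra_name, "em0", sizeof(ifra.ifra_name));	/* placeholder */

	ifra.ifra_addr.sin6_family = AF_INET6;
	ifra.ifra_addr.sin6_len = sizeof(ifra.ifra_addr);
	inet_pton(AF_INET6, "2001:db8::1", &ifra.ifra_addr.sin6_addr);

	ifra.ifra_prefixmask.sin6_family = AF_INET6;
	ifra.ifra_prefixmask.sin6_len = sizeof(ifra.ifra_prefixmask);
	memset(&ifra.ifra_prefixmask.sin6_addr, 0xff, 8);	/* /64 */

	ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
	ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;

	if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) == -1)
		return (1);
	if (ioctl(s, SIOCAIFADDR_IN6, &ifra) == -1)
		return (1);
	close(s);
	return (0);
}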
If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. 
We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ if (hostIsNew && in6if_do_dad(ifp)) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* DAD should be performed after ND6_IFF_IFDISABLED is cleared. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* * Perform DAD, if needed. * XXX It may be of use, if we can administratively disable DAD. */ if (in6if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && (ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. 
*/ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Remove local address entry from lltable. */ nd6_rem_ifa_lle(ia); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6) ? RTF_HOST : 0); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_refcnt--; ia->ia6_ndpr = NULL; } /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other other subsystems about address change/arrival: * 1) Notifies device handler on first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. 
* XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { + struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } if_addr_runlock(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. 
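/*
 * Editor's illustrative sketch (not part of this revision): the reason the
 * converted IN6_IFADDR_RLOCK()/IN6_IFADDR_RUNLOCK() calls above now take a
 * caller-supplied struct rm_priotracker is the rmlock(9) read-side API,
 * which keeps per-reader state on the reader's stack.  A generic pattern
 * follows; the lock name and data are made up, and the IN6_IFADDR_* macros
 * themselves (presumably thin wrappers around rm_rlock()/rm_runlock()) are
 * defined elsewhere, not in this hunk.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock example_lock;	/* hypothetical lock */
static int example_data;		/* hypothetical protected data */

static void
example_init(void)
{
	rm_init(&example_lock, "example");
}

static int
example_read(void)
{
	struct rm_priotracker tracker;	/* per-reader, lives on the stack */
	int v;

	rm_rlock(&example_lock, &tracker);
	v = example_data;		/* cheap read-mostly path */
	rm_runlock(&example_lock, &tracker);
	return (v);
}

static void
example_write(int v)
{
	rm_wlock(&example_lock);	/* writers take no tracker */
	example_data = v;
	rm_wunlock(&example_lock);
}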
*/ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { + struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { + struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { + struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... 
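/*
 * Editor's illustrative sketch (not part of this revision): ip6_sprintf()
 * above drops leading zeros within each group and collapses the longest
 * run of zero groups to "::".  inet_ntop(3) applies the same general
 * rules, shown here; minor edge-case differences between the two
 * formatters are possible.
 */
#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	struct in6_addr a;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "2001:0db8:0000:0000:0000:0000:0001:0000", &a);
	printf("%s\n", inet_ntop(AF_INET6, &a, buf, sizeof(buf)));
	/* prints 2001:db8::1:0 (groups 2..5 collapsed, not the final zero) */
	return (0);
}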
*/ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? 
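/*
 * Editor's illustrative sketch (not part of this revision): the expansion
 * done by in6_prefixlen2mask() above, using the same one-partial-byte
 * lookup table, with a worked example for a /61.  Assumes 0 <= len <= 128,
 * as the original's sanity check enforces.
 */
#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void
plen_to_mask(int len, struct in6_addr *mask)
{
	static const u_char maskarray[8] =
	    { 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
	int i;

	memset(mask, 0, sizeof(*mask));
	for (i = 0; i < len / 8; i++)
		mask->s6_addr[i] = 0xff;			/* whole bytes */
	if (len % 8)
		mask->s6_addr[len / 8] = maskarray[len % 8 - 1];	/* partial byte */
}

int
main(void)
{
	struct in6_addr m;
	char buf[INET6_ADDRSTRLEN];

	plen_to_mask(61, &m);
	printf("%s\n", inet_ntop(AF_INET6, &m, buf, sizeof(buf)));
	/* prints ffff:ffff:ffff:fff8:: */
	return (0);
}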
*/ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionaly, users may try to assign addresses * before the interface becomes up (or running). * We simply skip DAD in such a case as a work around. * XXX: we should rather mark "tentative" on such addresses, * and do DAD after the interface becomes ready. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. 
treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_IEEE80211: /* ditto */ case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_ARCNET: /* RFC2497 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; struct sockaddr_in6 l3_addr6; }; /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. */ static void in6_lltable_free(struct lltable *llt, struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in6_lltable_new(const struct sockaddr *l3addr, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->l3_addr6 = *(const struct sockaddr_in6 *)l3addr; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_free; LLE_LOCK_INIT(&lle->base); callout_init(&lle->base.ln_timer_ch, 1); return (&lle->base); } static void in6_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix; const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask; struct llentry *lle, *next; int i; /* * (flags & LLE_STATIC) means deleting all entries * including static ND6 entries. */ IF_AFDATA_WLOCK(llt->llt_ifp); for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { if (IN6_ARE_MASKED_ADDR_EQUAL( &satosin6(L3_ADDR(lle))->sin6_addr, &pfx->sin6_addr, &msk->sin6_addr) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) { LLE_WLOCK(lle); if (callout_stop(&lle->la_timer)) LLE_REMREF(lle); llentry_free(lle); } } } IF_AFDATA_WUNLOCK(llt->llt_ifp); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* Our local addresses are always only installed on the default FIB. */ /* XXX rtalloc1 should take a const param */ rt = in6_rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0, RT_DEFAULT_FIB); if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. 
*/ /* XXX ifaof_ifpforaddr should take a const param */ ifa = ifaof_ifpforaddr(__DECONST(struct sockaddr *, l3addr), ifp); if (ifa != NULL) { ifa_free(ifa); if (rt != NULL) RTFREE_LOCKED(rt); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &((const struct sockaddr_in6 *)l3addr)->sin6_addr)); if (rt != NULL) RTFREE_LOCKED(rt); return EINVAL; } RTFREE_LOCKED(rt); return 0; } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; struct llentries *lleh; u_int hashkey; IF_AFDATA_LOCK_ASSERT(ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); hashkey = sin6->sin6_addr.s6_addr32[3]; lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; LIST_FOREACH(lle, lleh, lle_next) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)L3_ADDR(lle); if (lle->la_flags & LLE_DELETED) continue; if (bcmp(&sa6->sin6_addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0) break; } if (lle == NULL) { if (!(flags & LLE_CREATE)) return (NULL); IF_AFDATA_WLOCK_ASSERT(ifp); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return NULL; lle = in6_lltable_new(l3addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return NULL; } lle->la_flags = flags & ~LLE_CREATE; if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); lle->la_flags |= (LLE_VALID | LLE_STATIC); } lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } else if (flags & LLE_DELETE) { if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) llentry_free(lle); else LLE_WUNLOCK(lle); } lle = (void *)-1; } if (LLE_IS_VALID(lle)) { if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); } return (lle); } static int in6_lltable_dump(struct lltable *llt, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; int i, error; if (ifp->if_flags & IFF_LOOPBACK) return 0; LLTABLE_LOCK_ASSERT(); error = 0; for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { struct sockaddr_dl *sdl; /* skip deleted or invalid entries */ if ((lle->la_flags & (LLE_DELETED|LLE_VALID)) != LLE_VALID) continue; /* Skip if jailed and not a valid IP of the prison. 
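/*
 * Editor's illustrative sketch (not part of this revision):
 * in6_lltable_lookup() above buckets neighbor entries by the low 32 bits
 * of the IPv6 address (s6_addr32[3]), which carry most of the per-host
 * variation once the shared on-link prefix is discounted.  The kernel runs
 * that key through LLATBL_HASH(); the simple modulo below only stands in
 * for that step, and the bucket count here is made up.
 */
#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	32		/* illustrative only */

static u_int
nd6_bucket(const struct in6_addr *a)
{
	uint32_t low;

	memcpy(&low, &a->s6_addr[12], sizeof(low));	/* low 32 bits */
	return (low % NBUCKETS);
}

int
main(void)
{
	struct in6_addr a;

	inet_pton(AF_INET6, "fe80::0225:90ff:fe01:2345", &a);
	printf("bucket %u\n", nd6_bucket(&a));
	return (0);
}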
*/ if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) continue; /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ bzero(&ndpc, sizeof(ndpc)); ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; ndpc.sin6.sin6_family = AF_INET6; ndpc.sin6.sin6_len = sizeof(ndpc.sin6); bcopy(L3_ADDR(lle), &ndpc.sin6, L3_ADDR_LEN(lle)); if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_alen = ifp->if_addrlen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); ndpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); if (error) break; } } return error; } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = lltable_init(ifp, AF_INET6); if (ext->lltable != NULL) { ext->lltable->llt_prefix_free = in6_lltable_prefix_free; ext->lltable->llt_lookup = in6_lltable_lookup; ext->lltable->llt_dump = in6_lltable_dump; } ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. 
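/*
 * Editor's illustrative sketch (not part of this revision): a userland
 * mirror of in6_sin6_2_sin() above.  A v4-mapped address keeps the IPv4
 * address in its last four bytes (::ffff:a.b.c.d), so the conversion is a
 * copy of the port plus those bytes.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void
mapped6_to_sin(const struct sockaddr_in6 *sin6, struct sockaddr_in *sin)
{
	memset(sin, 0, sizeof(*sin));
	sin->sin_len = sizeof(*sin);
	sin->sin_family = AF_INET;
	sin->sin_port = sin6->sin6_port;
	memcpy(&sin->sin_addr, &sin6->sin6_addr.s6_addr[12],
	    sizeof(sin->sin_addr));
}

int
main(void)
{
	struct sockaddr_in6 sin6;
	struct sockaddr_in sin;
	char buf[INET_ADDRSTRLEN];

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(80);
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &sin6.sin6_addr);
	mapped6_to_sin(&sin6, &sin);
	printf("%s:%u\n", inet_ntop(AF_INET, &sin.sin_addr, buf, sizeof(buf)),
	    ntohs(sin.sin_port));	/* prints 192.0.2.1:80 */
	return (0);
}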
*/ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 286000) +++ head/sys/netinet6/in6_ifattach.c (revision 286001) @@ -1,896 +1,898 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
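A small usage sketch of the two converters above, round-tripping the v4-mapped form of 192.0.2.1 (the variables and values are illustrative):

	struct sockaddr_in6 sin6, sin6_mapped;
	struct sockaddr_in sin;

	bzero(&sin6, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_port = htons(80);
	sin6.sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;	/* ::ffff:0:0/96 */
	sin6.sin6_addr.s6_addr32[3] = htonl(0xc0000201);	/* 192.0.2.1 */

	in6_sin6_2_sin(&sin, &sin6);			/* -> 192.0.2.1, port 80 */
	in6_sin_2_v4mapsin6(&sin, &sin6_mapped);	/* -> ::ffff:192.0.2.1 */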
* * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. 
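The net effect of get_rand_ifid()'s bit fix-up above, combined with the EUI64_TO_IFID() step, is an interface identifier whose group bit and (IFID-convention) universal bit are both clear; condensed:

	/* digest[0..7] were copied into s6_addr[8..15] */
	in6->s6_addr[8] &= ~EUI64_GBIT;	/* individual, not group */
	in6->s6_addr[8] |= EUI64_UBIT;	/* "local" in EUI-64 terms */
	in6->s6_addr[8] ^= EUI64_UBIT;	/* EUI64_TO_IFID inverts the u bit */
	/* result: bits 0x01 (g) and 0x02 (u) of s6_addr[8] end up 0 */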
*/ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_FDDI: case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. 
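One apparent gap in the zero-digest fallback of generate_tmp_ifid() above: val32 is computed but never stored, so an all-zero identifier would still be returned. If the intent of the surrounding comment is to substitute a non-zero value, a hedged sketch of the missing store would be:

	if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
		nd6log((LOG_INFO,
		    "generate_tmp_ifid: computed MD5 value is zero.\n"));
		val32 = arc4random();
		val32 = 1 + (val32 % (0xffffffff - 1));
		bcopy(&val32, ret, sizeof(val32));	/* actually use the value */
	}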
*/ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) { IF_ADDR_RUNLOCK(ifp); return -1; } if (!addr[0]) { IF_ADDR_RUNLOCK(ifp); return -1; } bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; int error; /* * configure link-local address. 
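A worked example of the 6-byte (Ethernet-style) branch above: the 48-bit MAC is split around ff:fe and the universal/local bit is inverted by the EUI64_TO_IFID() step at the end of the function. The helper name here is illustrative:

	static void
	mac48_to_ifid(const uint8_t mac[6], struct in6_addr *in6)
	{
		in6->s6_addr[8]  = mac[0];
		in6->s6_addr[9]  = mac[1];
		in6->s6_addr[10] = mac[2];
		in6->s6_addr[11] = 0xff;	/* EUI-64 padding */
		in6->s6_addr[12] = 0xfe;
		in6->s6_addr[13] = mac[3];
		in6->s6_addr[14] = mac[4];
		in6->s6_addr[15] = mac[5];
		in6->s6_addr[8] ^= EUI64_UBIT;	/* as EUI64_TO_IFID() does */
	}
	/* 00:11:22:33:44:55 yields the interface ID 0211:22ff:fe33:4455 */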
*/ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if (nd6_prefix_lookup(&pr0) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. 
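Putting the pieces together for the non-loopback case above, the tentative link-local address is simply fe80::/64 plus the interface identifier, with the scope zone embedded before in6_update_ifa() starts DAD. Schematically (mac48_to_ifid is the illustrative sketch from in6_get_hw_ifid above):

	struct in6_addr ll;

	bzero(&ll, sizeof(ll));
	ll.s6_addr32[0] = htonl(0xfe800000);	/* fe80::/64 */
	mac48_to_ifid(mac, &ll);		/* e.g. fe80::211:22ff:fe33:4455 */
	/* in6_setscope(&ll, ifp, NULL) then embeds the link's zone id before
	 * the address is handed to in6_update_ifa() with IN6_IFAUPDATE_DADDELAY */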
*/ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* we don't need to perform DAD on loopback interfaces. */ ifra.ifra_flags |= IN6_IFF_NODAD; /* skip registration to the prefix list. XXX should be temporary. */ ifra.ifra_flags |= IN6_IFF_NOPFX; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_addr in6; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. 
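For the default (oldmcprefix == 0) case above, the node-information group address ends up as ff02::2:ffXX:XXXX, where the XX bytes are the first 24 bits of MD5(length || lowercased first label) and the scope zone is filled in by in6_setscope(); the layout, with digest bytes shown symbolically:

	in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL;	/* ff02:: */
	in6->s6_addr8[11] = 2;
	in6->s6_addr8[12] = 0xff;			/* RFC 4620 form */
	in6->s6_addr8[13] = digest[0];
	in6->s6_addr8[14] = digest[1];
	in6->s6_addr8[15] = digest[2];
	/* i.e. ff02::2:ffd0:d1d2 with d0 d1 d2 = digest[0..2] */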
*/ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { struct ifaddr *ifa; in6 = in6addr_loopback; ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &in6); if (ifa == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } else ifa_free(ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { int error; ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { error = in6_ifattach_linklocal(ifp, altifp); #if 0 if (error) log(LOG_NOTICE, "in6_ifattach_linklocal: " "failed to add a link-local addr to %s\n", if_name(ifp)); #endif } else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. */ void in6_ifdetach(struct ifnet *ifp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* remove neighbor management table */ nd6_purge(ifp); /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * remove neighbor management table. we call it twice just to make * sure we nuke everything. maybe we need just one call. * XXX: since the first call did not release addresses, some prefixes * might remain. We should call nd6_purge() again to release the * prefixes after removing all addresses above. * (Or can we just delay calling nd6_purge until at this point?) */ nd6_purge(ifp); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. 
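The reschedule interval computed in in6_tmpaddrtimer() above is deliberately a little shorter than the temporary-address preferred lifetime, so fresh random IFIDs exist before the current temporary addresses stop being preferred. A sketch of the arithmetic (the indicative values are the usual RFC 4941-style defaults; the authoritative ones are the ip6_temp_* sysctls):

	int next_ticks;

	next_ticks = (V_ip6_temp_preferred_lifetime	/* e.g. one day      */
	    - V_ip6_desync_factor			/* small random skew */
	    - V_ip6_temp_regen_advance)			/* a few seconds     */
	    * hz;
	/* in6_tmpaddrtimer() passes exactly this to callout_reset() */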
*/ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in6_multi) purgeinms; struct in6_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN6_MULTI_LOCK(); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, in6m_entry); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, in6m_entry, tinm) { LIST_REMOVE(inm, in6m_entry); in6m_release_locked(inm); } mld_ifdetach(ifp); IN6_MULTI_UNLOCK(); } Index: head/sys/netinet6/in6_src.c =================================================================== --- head/sys/netinet6/in6_src.c (revision 286000) +++ head/sys/netinet6/in6_src.c (revision 286001) @@ -1,1213 +1,1215 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) static VNET_DEFINE(struct in6_addrpolicy, defaultaddrpolicy); #define V_defaultaddrpolicy VNET(defaultaddrpolicy) VNET_DEFINE(int, ip6_prefer_tempaddr) = 0; static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **, int, u_int); static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *ro, struct ifnet **, struct ifnet *, u_int); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *), void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ IP6STAT_INC(ip6s_sources_rule[(r)]); \ rule = (r); \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
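The REPLACE()/NEXT()/BREAK() macros being defined here drive the rule-by-rule comparison inside in6_selectsrc()'s address loop further below; each numbered rule follows the same shape (the condition names are placeholders):

	/* inside TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) */
	if (candidate_is_better)
		REPLACE(n);	/* count ip6s_sources_rule[n], adopt ia as best */
	if (current_best_is_better)
		NEXT(n);	/* keep ia_best, evaluate the next address */
	/* neither: fall through to rule n+1;
	 * BREAK(n) ends the scan outright (rule 1, exact match) */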
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ IP6STAT_INC(ip6s_sources_rule[(r)]); \ rule = (r); \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) int in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct route_in6 *ro, struct ucred *cred, struct ifnet **ifpp, struct in6_addr *srcp) { + struct rm_priotracker in6_ifa_tracker; struct in6_addr dst, tmp; struct ifnet *ifp = NULL, *oifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; int error, rule; struct ip6_moptions *mopts; KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__)); dst = dstsock->sin6_addr; /* make a copy for local operation */ if (ifpp) { /* * Save a possibly passed in ifp for in6_selectsrc. Only * neighbor discovery code should use this feature, where * we may know the interface but not the FIB number holding * the connected subnet in case someone deleted it from the * default FIB and we need to check the interface. */ if (*ifpp != NULL) oifp = *ifpp; *ifpp = NULL; } if (inp != NULL) { INP_LOCK_ASSERT(inp); mopts = inp->in6p_moptions; } else { mopts = NULL; } /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { struct sockaddr_in6 srcsock; struct in6_ifaddr *ia6; /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp, (inp != NULL) ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB)) != 0) return (error); /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ bzero(&srcsock, sizeof(srcsock)); srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; if (ifp) { error = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (error) return (error); } if (cred != NULL && (error = prison_local_ip6(cred, &srcsock.sin6_addr, (inp != NULL && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); ia6 = (struct in6_ifaddr *)ifa_ifwithaddr( (struct sockaddr *)&srcsock); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { if (ia6 != NULL) ifa_free(&ia6->ia_ifa); return (EADDRNOTAVAIL); } pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; bcopy(&ia6->ia_addr.sin6_addr, srcp, sizeof(*srcp)); ifa_free(&ia6->ia_ifa); return (0); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (cred != NULL && (error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp)); return (0); } /* * Bypass source address selection and use the primary jail IP * if requested. 
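The "explicitly specified by the caller" branch above is what an RFC 3542 application reaches by pinning a source address with IPV6_PKTINFO; a minimal userland fragment (the socket s and the address are placeholders, error handling omitted):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <string.h>

	struct in6_pktinfo pi;

	memset(&pi, 0, sizeof(pi));
	inet_pton(AF_INET6, "2001:db8::1", &pi.ipi6_addr);	/* wanted source */
	pi.ipi6_ifindex = 0;		/* 0: let the kernel pick the interface */
	setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO, &pi, sizeof(pi));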
*/ if (cred != NULL && !prison_saddrsel_ip6(cred, srcp)) return (0); /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp, (inp != NULL) ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB)) != 0) return (error); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif error = in6_setscope(&dst, ifp, &odstzone); if (error) return (error); rule = 0; - IN6_IFADDR_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. */ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* If jailed only take addresses of the jail into account. */ if (cred != NULL && prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) { if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); } /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. 
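The tracker argument added to IN6_IFADDR_RLOCK()/IN6_IFADDR_RUNLOCK() in this hunk is the usual rmlock(9) read-side pattern: the rm_priotracker lives on the reader's stack and must be passed to both entry and exit. Schematically (the real macros and the lock itself are defined in in6_var.h; the lock name is shown for illustration):

	struct rm_priotracker tracker;

	rm_rlock(&in6_ifaddr_lock, &tracker);	/* cheap, mostly per-CPU */
	/* ... walk V_in6_ifaddrhead read-only ... */
	rm_runlock(&in6_ifaddr_lock, &tracker);	/* same tracker on exit */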
*/ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 9: prefer address with better virtual status. */ if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa)) REPLACE(9); if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa)) NEXT(9); /* * Rule 10: prefer address with `prefer_source' flag. */ if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0) REPLACE(10); if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0) NEXT(10); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? */ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } /* * At this point at least one of the addresses belonged to the jail * but it could still be, that we want to further restrict it, e.g. * theoratically IN6_IS_ADDR_LOOPBACK. * It must not be IN6_IS_ADDR_UNSPECIFIED anymore. * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should * let all others previously selected pass. * Use tmp to not change ::1 on lo0 to the primary jail address. 
*/ tmp = ia->ia_addr.sin6_addr; if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) { - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } if (ifpp) *ifpp = ifp; bcopy(&tmp, srcp, sizeof(*srcp)); if (ia->ia_ifp == ifp) IP6STAT_INC(ip6s_sources_sameif[best_scope]); else IP6STAT_INC(ip6s_sources_otherif[best_scope]); if (dst_scope == best_scope) IP6STAT_INC(ip6s_sources_samescope[best_scope]); else IP6STAT_INC(ip6s_sources_otherscope[best_scope]); if (IFA6_IS_DEPRECATED(ia)) IP6STAT_INC(ip6s_sources_deprecated[best_scope]); - IN6_IFADDR_RUNLOCK(); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * clone - meaningful only for bsdi and freebsd */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum) { int error = 0; struct ifnet *ifp = NULL; struct rtentry *rt = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; uint32_t zoneid; #if 0 char ip6buf[INET6_ADDRSTRLEN]; if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { printf("in6_selectroute: strange destination %s\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr)); } else { printf("in6_selectroute: destination = %s%%%d\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } /* * If destination address is LLA or link- or node-local multicast, * use it's embedded scope zone id to determine outgoing interface. */ if (IN6_IS_ADDR_MC_LINKLOCAL(dst) || IN6_IS_ADDR_MC_NODELOCAL(dst)) { zoneid = ntohs(in6_getscope(dst)); if (zoneid > 0) { ifp = in6_getlinkifnet(zoneid); goto done; } } getroute: /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron; struct llentry *la; sin6_next = satosin6(opts->ip6po_nexthop); /* at this moment, we only support AF_INET6 next hops */ if (sin6_next->sin6_family != AF_INET6) { error = EAFNOSUPPORT; /* or should we proceed? */ goto done; } /* * If the next hop is an IPv6 address, then the node identified * by that address must be a neighbor of the sending host. */ ron = &opts->ip6po_nextroute; /* * XXX what do we do here? * PLZ to be fixing */ if (ron->ro_rt == NULL) { in6_rtalloc(ron, fibnum); /* multi path case? */ if (ron->ro_rt == NULL) { /* XXX-BZ WT.? 
*/ if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } error = EHOSTUNREACH; goto done; } } rt = ron->ro_rt; ifp = rt->rt_ifp; IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)sin6_next); IF_AFDATA_RUNLOCK(ifp); if (la != NULL) LLE_RUNLOCK(la); else { error = EHOSTUNREACH; goto done; } #if 0 if ((ron->ro_rt && (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != (RTF_UP | RTF_LLINFO)) || !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, &sin6_next->sin6_addr)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } *satosin6(&ron->ro_dst) = *sin6_next; } if (ron->ro_rt == NULL) { in6_rtalloc(ron, fibnum); /* multi path case? */ if (ron->ro_rt == NULL || !(ron->ro_rt->rt_flags & RTF_LLINFO)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } error = EHOSTUNREACH; goto done; } } #endif /* * When cloning is required, try to allocate a route to the * destination so that the caller can store path MTU * information. */ goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. */ if (ro) { if (ro->ro_rt && (!(ro->ro_rt->rt_flags & RTF_UP) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } if (ro->ro_rt == (struct rtentry *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; #ifdef RADIX_MPATH rtalloc_mpath_fib((struct route *)ro, ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum); #else ro->ro_rt = in6_rtalloc1((struct sockaddr *) &ro->ro_dst, 0, 0UL, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); #endif } /* * do not care about the result if we have the nexthop * explicitly specified. */ if (opts && opts->ip6po_nexthop) goto done; if (ro->ro_rt) { ifp = ro->ro_rt->rt_ifp; if (ifp == NULL) { /* can this really happen? */ RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (ro->ro_rt == NULL) error = EHOSTUNREACH; rt = ro->ro_rt; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (ifp && opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { error = EHOSTUNREACH; goto done; } } } done: if (ifp == NULL && rt == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. */ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) IP6STAT_INC(ip6s_noroute); if (retifp != NULL) { *retifp = ifp; /* * Adjust the "outgoing" interface. If we're going to loop * the packet back to ourselves, the ifp would be the loopback * interface. However, we'd rather know the interface associated * to the destination address (which should probably be one of * our own addresses.) 
*/ if (rt) { if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) && (rt->rt_gateway->sa_family == AF_LINK)) *retifp = ifnet_byindex(((struct sockaddr_dl *) rt->rt_gateway)->sdl_index); } } if (retrt != NULL) *retrt = rt; /* rt may be NULL */ return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct ifnet *oifp, u_int fibnum) { int error; struct route_in6 sro; struct rtentry *rt = NULL; KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__)); if (ro == NULL) { bzero(&sro, sizeof(sro)); ro = &sro; } if ((error = selectroute(dstsock, opts, mopts, ro, retifp, &rt, 1, fibnum)) != 0) { if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); /* Help ND. See oifp comment in in6_selectsrc(). */ if (oifp != NULL && fibnum == RT_DEFAULT_FIB) { *retifp = oifp; error = 0; } return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (flags); } if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (0); } /* * Public wrapper function to selectroute(). * * XXX-BZ in6_selectroute() should and will grow the FIB argument. The * in6_selectroute_fib() function is only there for backward compat on stable. */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt) { return (selectroute(dstsock, opts, mopts, ro, retifp, retrt, 0, RT_DEFAULT_FIB)); } #ifndef BURN_BRIDGES int in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, u_int fibnum) { return (selectroute(dstsock, opts, mopts, ro, retifp, retrt, 0, fibnum)); } #endif /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp) { if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { struct route_in6 ro6; struct ifnet *lifp; bzero(&ro6, sizeof(ro6)); ro6.ro_dst.sin6_family = AF_INET6; ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6.ro_dst.sin6_addr = in6p->in6p_faddr; in6_rtalloc(&ro6, in6p->inp_inc.inc_fibnum); if (ro6.ro_rt) { lifp = ro6.ro_rt->rt_ifp; RTFREE(ro6.ro_rt); if (lifp) return (ND_IFINFO(lifp)->chlim); } } return (V_ip6_defhlim); } /* * XXX: this is borrowed from in6_pcbbind(). 
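The first item in in6_selecthlim()'s precedence list (a hop limit "specified via ioctl") is in practice the per-socket value installed with the IPV6_UNICAST_HOPS socket option, which is what populates in6p_hops; a one-line userland sketch (the socket s is a placeholder):

	int hlim = 32;

	setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hlim, sizeof(hlim));
	/* in6p_hops becomes 32 and wins over the RA-learned per-ifp chlim */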
If possible, we should * share this function by all *bsd*... */ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { struct socket *so = inp->inp_socket; u_int16_t lport = 0; int error, lookupflags = 0; #ifdef INVARIANTS struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #endif INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); error = prison_local_ip6(cred, laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)); if (error) return(error); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags); if (error != 0) return (error); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } void addrsel_policy_init(void) { init_policy_queue(); /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; if (!IS_DEFAULT_VNET(curvnet)) return; ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. */ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbages (if any) of the prefix address. */ IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr); ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The followings are implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX implementation using binary tree should be more efficient. 
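The SIOCAADDRCTL_POLICY entries validated by in6_src_ioctl() above are normally installed by ip6addrctl(8); a sketch of one such entry as the ioctl would receive it, here for v4-mapped destinations (the precedence and label values are illustrative, not the shipped defaults):

	struct in6_addrpolicy ent;

	bzero(&ent, sizeof(ent));
	ent.addr.sin6_family = ent.addrmask.sin6_family = AF_INET6;
	ent.addr.sin6_len = ent.addrmask.sin6_len = sizeof(struct sockaddr_in6);
	ent.addr.sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;	/* ::ffff:0:0 */
	memset(&ent.addrmask.sin6_addr, 0xff, 12);		/* /96 mask */
	ent.preced = 35;	/* illustrative */
	ent.label = 4;		/* illustrative */
	/* in6_src_ioctl() verifies the mask is contiguous, masks ent.addr,
	 * and add_addrsel_policyent() links it into V_addrsel_policytab */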
*/ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); static VNET_DEFINE(struct addrsel_policyhead, addrsel_policytab); #define V_addrsel_policytab VNET(addrsel_policytab) static void init_policy_queue(void) { TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { struct addrsel_policyent *new, *pol; new = malloc(sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(new, M_IFADDR); return (EEXIST); /* or override it? */ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(pol, M_IFADDR); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } Index: head/sys/netinet6/in6_var.h =================================================================== --- head/sys/netinet6/in6_var.h (revision 286000) +++ head/sys/netinet6/in6_var.h (revision 286001) @@ -1,840 +1,840 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_var.h,v 1.56 2001/03/29 05:34:31 itojun Exp $ */ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ /* * pltime/vltime are just for future reference (required to implements 2 * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. 
* userland -> kernel: accept pltime/vltime * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { time_t ia6t_expire; /* valid lifetime expiration time */ time_t ia6t_preferred; /* preferred lifetime expiration time */ u_int32_t ia6t_vltime; /* valid lifetime */ u_int32_t ia6t_pltime; /* prefix lifetime */ }; struct nd_ifinfo; struct scope6_id; struct lltable; struct mld_ifsoftc; struct in6_ifextra { counter_u64_t *in6_ifstat; counter_u64_t *icmp6_ifstat; struct nd_ifinfo *nd_ifinfo; struct scope6_id *scope6_id; struct lltable *lltable; struct mld_ifsoftc *mld_ifinfo; }; #define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable) #ifdef _KERNEL struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags struct sockaddr_in6 ia_addr; /* interface address */ struct sockaddr_in6 ia_net; /* network number of interface */ struct sockaddr_in6 ia_dstaddr; /* space for destination addr */ struct sockaddr_in6 ia_prefixmask; /* prefix mask */ u_int32_t ia_plen; /* prefix length */ TAILQ_ENTRY(in6_ifaddr) ia_link; /* list of IPv6 addresses */ int ia6_flags; struct in6_addrlifetime ia6_lifetime; time_t ia6_createtime; /* the creation time of this address, which is * currently used for temporary addresses only. */ time_t ia6_updatetime; /* back pointer to the ND prefix (for autoconfigured addresses only) */ struct nd_prefix *ia6_ndpr; /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; /* entry in bucket of inet6 addresses */ LIST_ENTRY(in6_ifaddr) ia6_hash; }; /* List of in6_ifaddr's. */ TAILQ_HEAD(in6_ifaddrhead, in6_ifaddr); LIST_HEAD(in6_ifaddrlisthead, in6_ifaddr); #endif /* _KERNEL */ /* control structure to manage address selection policy */ struct in6_addrpolicy { struct sockaddr_in6 addr; /* prefix address */ struct sockaddr_in6 addrmask; /* prefix mask */ int preced; /* precedence */ int label; /* matching label */ u_quad_t use; /* statistics */ }; /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). 
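/*
 * A tiny userland-style illustration (hypothetical helper, not part of this
 * header) of the usual RFC 4862 reading of the in6_addrlifetime fields above:
 * ia6t_pltime/ia6t_vltime are relative lifetimes in seconds, while
 * ia6t_preferred/ia6t_expire are the absolute deadlines derived from them.
 * An address is "deprecated" between the two deadlines and invalid after the
 * second one.
 */
#include <stdbool.h>
#include <time.h>

struct lifetimes {
	time_t preferred;	/* deadline after which the addr is deprecated */
	time_t expire;		/* deadline after which the addr is invalid */
};

static bool
addr_deprecated(const struct lifetimes *lt, time_t now)
{
	return (now > lt->preferred && now <= lt->expire);
}

static bool
addr_invalid(const struct lifetimes *lt, time_t now)
{
	return (now > lt->expire);
}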
*/ struct in6_ifstat { uint64_t ifs6_in_receive; /* # of total input datagram */ uint64_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ uint64_t ifs6_in_toobig; /* # of datagrams exceeded MTU */ uint64_t ifs6_in_noroute; /* # of datagrams with no route */ uint64_t ifs6_in_addrerr; /* # of datagrams with invalid dst */ uint64_t ifs6_in_protounknown; /* # of datagrams with unknown proto */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_truncated; /* # of truncated datagrams */ uint64_t ifs6_in_discard; /* # of discarded datagrams */ /* NOTE: fragment timeout is not here */ uint64_t ifs6_in_deliver; /* # of datagrams delivered to ULP */ /* NOTE: increment on final dst if */ uint64_t ifs6_out_forward; /* # of datagrams forwarded */ /* NOTE: increment on outgoing if */ uint64_t ifs6_out_request; /* # of outgoing datagrams from ULP */ /* NOTE: does not include forwrads */ uint64_t ifs6_out_discard; /* # of discarded datagrams */ uint64_t ifs6_out_fragok; /* # of datagrams fragmented */ uint64_t ifs6_out_fragfail; /* # of datagrams failed on fragment */ uint64_t ifs6_out_fragcreat; /* # of fragment datagrams */ /* NOTE: this is # after fragment */ uint64_t ifs6_reass_reqd; /* # of incoming fragmented packets */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_mcast; /* # of inbound multicast datagrams */ uint64_t ifs6_out_mcast; /* # of outbound multicast datagrams */ }; /* * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry. * XXX: I'm not sure if this file is the right place for this structure... */ struct icmp6_ifstat { /* * Input statistics */ /* ipv6IfIcmpInMsgs, total # of input messages */ uint64_t ifs6_in_msg; /* ipv6IfIcmpInErrors, # of input error messages */ uint64_t ifs6_in_error; /* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */ uint64_t ifs6_in_dstunreach; /* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */ uint64_t ifs6_in_adminprohib; /* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */ uint64_t ifs6_in_timeexceed; /* ipv6IfIcmpInParmProblems, # of input parameter problem errors */ uint64_t ifs6_in_paramprob; /* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */ uint64_t ifs6_in_pkttoobig; /* ipv6IfIcmpInEchos, # of input echo requests */ uint64_t ifs6_in_echo; /* ipv6IfIcmpInEchoReplies, # of input echo replies */ uint64_t ifs6_in_echoreply; /* ipv6IfIcmpInRouterSolicits, # of input router solicitations */ uint64_t ifs6_in_routersolicit; /* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */ uint64_t ifs6_in_routeradvert; /* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */ uint64_t ifs6_in_neighborsolicit; /* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */ uint64_t ifs6_in_neighboradvert; /* ipv6IfIcmpInRedirects, # of input redirects */ uint64_t ifs6_in_redirect; /* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */ uint64_t ifs6_in_mldquery; /* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */ uint64_t ifs6_in_mldreport; /* ipv6IfIcmpInGroupMembReductions, # of input MLD done */ uint64_t ifs6_in_mlddone; /* * Output statistics. We should solve unresolved routing problem... 
*/ /* ipv6IfIcmpOutMsgs, total # of output messages */ uint64_t ifs6_out_msg; /* ipv6IfIcmpOutErrors, # of output error messages */ uint64_t ifs6_out_error; /* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */ uint64_t ifs6_out_dstunreach; /* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */ uint64_t ifs6_out_adminprohib; /* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */ uint64_t ifs6_out_timeexceed; /* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */ uint64_t ifs6_out_paramprob; /* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */ uint64_t ifs6_out_pkttoobig; /* ipv6IfIcmpOutEchos, # of output echo requests */ uint64_t ifs6_out_echo; /* ipv6IfIcmpOutEchoReplies, # of output echo replies */ uint64_t ifs6_out_echoreply; /* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */ uint64_t ifs6_out_routersolicit; /* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */ uint64_t ifs6_out_routeradvert; /* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */ uint64_t ifs6_out_neighborsolicit; /* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */ uint64_t ifs6_out_neighboradvert; /* ipv6IfIcmpOutRedirects, # of output redirects */ uint64_t ifs6_out_redirect; /* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */ uint64_t ifs6_out_mldquery; /* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */ uint64_t ifs6_out_mldreport; /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ uint64_t ifs6_out_mlddone; }; struct in6_ifreq { char ifr_name[IFNAMSIZ]; union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; struct icmp6_ifstat ifru_icmp6stat; u_int32_t ifru_scope_id[16]; } ifr_ifru; }; struct in6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; int ifra_vhid; }; /* pre-10.x compat */ struct oin6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; }; /* prefix type macro */ #define IN6_PREFIX_ND 1 #define IN6_PREFIX_RR 2 /* * prefix related flags passed between kernel(NDP related part) and * user land command(ifconfig) and daemon(rtadvd). 
*/ struct in6_prflags { struct prf_ra { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } prf_ra; u_char prf_reserved1; u_short prf_reserved2; /* want to put this on 4byte offset */ struct prf_rr { u_char decrvalid : 1; u_char decrprefd : 1; u_char reserved : 6; } prf_rr; u_char prf_reserved3; u_short prf_reserved4; }; struct in6_prefixreq { char ipr_name[IFNAMSIZ]; u_char ipr_origin; u_char ipr_plen; u_int32_t ipr_vltime; u_int32_t ipr_pltime; struct in6_prflags ipr_flags; struct sockaddr_in6 ipr_prefix; }; #define PR_ORIG_RA 0 #define PR_ORIG_RR 1 #define PR_ORIG_STATIC 2 #define PR_ORIG_KERNEL 3 #define ipr_raf_onlink ipr_flags.prf_ra.onlink #define ipr_raf_auto ipr_flags.prf_ra.autonomous #define ipr_statef_onlink ipr_flags.prf_state.onlink #define ipr_rrf_decrvalid ipr_flags.prf_rr.decrvalid #define ipr_rrf_decrprefd ipr_flags.prf_rr.decrprefd struct in6_rrenumreq { char irr_name[IFNAMSIZ]; u_char irr_origin; u_char irr_m_len; /* match len for matchprefix */ u_char irr_m_minlen; /* minlen for matching prefix */ u_char irr_m_maxlen; /* maxlen for matching prefix */ u_char irr_u_uselen; /* uselen for adding prefix */ u_char irr_u_keeplen; /* keeplen from matching prefix */ struct irr_raflagmask { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } irr_raflagmask; u_int32_t irr_vltime; u_int32_t irr_pltime; struct in6_prflags irr_flags; struct sockaddr_in6 irr_matchprefix; struct sockaddr_in6 irr_useprefix; }; #define irr_raf_mask_onlink irr_raflagmask.onlink #define irr_raf_mask_auto irr_raflagmask.autonomous #define irr_raf_mask_reserved irr_raflagmask.reserved #define irr_raf_onlink irr_flags.prf_ra.onlink #define irr_raf_auto irr_flags.prf_ra.autonomous #define irr_statef_onlink irr_flags.prf_state.onlink #define irr_rrf irr_flags.prf_rr #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 */ #define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr)) #define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr)) #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) #define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) #ifdef _KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #define IN6_MASK_ADDR(a, m) do { \ (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ } while (0) #endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) #ifdef _KERNEL /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. 
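/*
 * IN6_ARE_MASKED_ADDR_EQUAL() above is this header's prefix-comparison
 * primitive.  A small userland sketch of the same (dst ^ addr) & mask == 0
 * test, spelled out byte by byte (the helper name is made up for
 * illustration; the macro does it four 32-bit words at a time):
 */
#include <stdbool.h>
#include <netinet/in.h>

static bool
prefix_matches(const struct in6_addr *dst, const struct in6_addr *addr,
    const struct in6_addr *mask)
{
	int i;

	for (i = 0; i < 16; i++) {
		/* any bit that differs under the mask means no match */
		if (((dst->s6_addr[i] ^ addr->s6_addr[i]) &
		    mask->s6_addr[i]) != 0)
			return (false);
	}
	return (true);
}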
*/ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) #endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define OSIOCAIFADDR_IN6 _IOW('i', 26, struct oin6_aliasreq) #define SIOCAIFADDR_IN6 _IOW('i', 27, struct in6_aliasreq) #define SIOCSIFPHYADDR_IN6 _IOW('i', 70, struct in6_aliasreq) #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #define SIOCGDRLST_IN6 _IOWR('i', 74, struct in6_drlist) #ifdef _KERNEL /* XXX: SIOCGPRLST_IN6 is exposed in KAME but in6_oprlist is not. */ #define SIOCGPRLST_IN6 _IOWR('i', 75, struct in6_oprlist) #endif #ifdef _KERNEL #define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #endif #define SIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ndireq) #define SIOCSIFINFO_IN6 _IOWR('i', 109, struct in6_ndireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) #define SIOCGIFALIFETIME_IN6 _IOWR('i', 81, struct in6_ifreq) #define SIOCSIFALIFETIME_IN6 _IOWR('i', 82, struct in6_ifreq) #define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq) #define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq) #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ #define SIOCSSCOPE6 _IOW('i', 88, struct in6_ifreq) #define SIOCGSCOPE6 _IOWR('i', 89, struct in6_ifreq) #define SIOCGSCOPE6DEF _IOWR('i', 90, struct in6_ifreq) #define SIOCSIFPREFIX_IN6 _IOW('i', 100, struct in6_prefixreq) /* set */ #define SIOCGIFPREFIX_IN6 _IOWR('i', 101, struct in6_prefixreq) /* get */ #define SIOCDIFPREFIX_IN6 _IOW('i', 102, struct in6_prefixreq) /* del */ #define SIOCAIFPREFIX_IN6 _IOW('i', 103, struct in6_rrenumreq) /* add */ #define SIOCCIFPREFIX_IN6 _IOW('i', 104, \ struct in6_rrenumreq) /* change */ #define SIOCSGIFPREFIX_IN6 _IOW('i', 105, \ struct in6_rrenumreq) /* set global */ #define SIOCGETSGCNT_IN6 _IOWR('u', 106, \ struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ #define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) #define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address * (used only at first SIOC* call) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */ #define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */ #define IN6_IFF_NOPFX 0x8000 /* skip kernel prefix management. * XXX: this should be temporary. 
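/*
 * The ioctls and IN6_IFF_* bits above are what ifconfig-style tools use to
 * inspect per-address state from userland.  A minimal sketch of querying the
 * flag bits of one address with SIOCGIFAFLAG_IN6 (hypothetical helper; error
 * handling trimmed, the sockaddr_in6 is assumed to be filled in by the
 * caller):
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
print_addr_flags(const char *ifname, const struct sockaddr_in6 *sin6)
{
	struct in6_ifreq ifr6;
	int s;

	if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) < 0)
		return (-1);
	memset(&ifr6, 0, sizeof(ifr6));
	strlcpy(ifr6.ifr_name, ifname, sizeof(ifr6.ifr_name));
	ifr6.ifr_ifru.ifru_addr = *sin6;		/* address to query */
	if (ioctl(s, SIOCGIFAFLAG_IN6, &ifr6) < 0) {
		close(s);
		return (-1);
	}
	close(s);
	if (ifr6.ifr_ifru.ifru_flags6 & IN6_IFF_TENTATIVE)
		printf("tentative (DAD still running)\n");
	if (ifr6.ifr_ifru.ifru_flags6 & IN6_IFF_DEPRECATED)
		printf("deprecated\n");
	if (ifr6.ifr_ifru.ifru_flags6 & IN6_IFF_ANYCAST)
		printf("anycast\n");
	return (0);
}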
*/ /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) #ifdef _KERNEL #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b)) #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif #ifdef _KERNEL VNET_DECLARE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DECLARE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DECLARE(u_long, in6_ifaddrhmask); #define V_in6_ifaddrhead VNET(in6_ifaddrhead) #define V_in6_ifaddrhashtbl VNET(in6_ifaddrhashtbl) #define V_in6_ifaddrhmask VNET(in6_ifaddrhmask) #define IN6ADDR_NHASH_LOG2 8 #define IN6ADDR_NHASH (1 << IN6ADDR_NHASH_LOG2) #define IN6ADDR_HASHVAL(x) (in6_addrhash(x)) #define IN6ADDR_HASH(x) \ (&V_in6_ifaddrhashtbl[IN6ADDR_HASHVAL(x) & V_in6_ifaddrhmask]) static __inline uint32_t in6_addrhash(const struct in6_addr *in6) { uint32_t x; x = in6->s6_addr32[0] ^ in6->s6_addr32[1] ^ in6->s6_addr32[2] ^ in6->s6_addr32[3]; return (fnv_32_buf(&x, sizeof(x), FNV1_32_INIT)); } -extern struct rwlock in6_ifaddr_lock; -#define IN6_IFADDR_LOCK_ASSERT( ) rw_assert(&in6_ifaddr_lock, RA_LOCKED) -#define IN6_IFADDR_RLOCK() rw_rlock(&in6_ifaddr_lock) -#define IN6_IFADDR_RLOCK_ASSERT() rw_assert(&in6_ifaddr_lock, RA_RLOCKED) -#define IN6_IFADDR_RUNLOCK() rw_runlock(&in6_ifaddr_lock) -#define IN6_IFADDR_WLOCK() rw_wlock(&in6_ifaddr_lock) -#define IN6_IFADDR_WLOCK_ASSERT() rw_assert(&in6_ifaddr_lock, RA_WLOCKED) -#define IN6_IFADDR_WUNLOCK() rw_wunlock(&in6_ifaddr_lock) +extern struct rmlock in6_ifaddr_lock; +#define IN6_IFADDR_LOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_LOCKED) +#define IN6_IFADDR_RLOCK(t) rm_rlock(&in6_ifaddr_lock, (t)) +#define IN6_IFADDR_RLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_RLOCKED) +#define IN6_IFADDR_RUNLOCK(t) rm_runlock(&in6_ifaddr_lock, (t)) +#define IN6_IFADDR_WLOCK() rm_wlock(&in6_ifaddr_lock) +#define IN6_IFADDR_WLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_WLOCKED) +#define IN6_IFADDR_WUNLOCK() rm_wunlock(&in6_ifaddr_lock) #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ counter_u64_add(((struct in6_ifextra *) \ ((ifp)->if_afdata[AF_INET6]))->in6_ifstat[ \ offsetof(struct in6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) extern u_char inet6ctlerrmap[]; VNET_DECLARE(unsigned long, in6_maxmtu); #define V_in6_maxmtu VNET(in6_maxmtu) #endif /* _KERNEL */ /* * IPv6 multicast MLD-layer source entry. */ struct ip6_msource { RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ struct in6_addr im6s_addr; struct im6s_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } im6s_st[2]; /* state at t0, t1 */ uint8_t im6s_stp; /* pending query */ }; RB_HEAD(ip6_msource_tree, ip6_msource); /* * IPv6 multicast PCB-layer source entry. * * NOTE: overlapping use of struct ip6_msource fields at start. */ struct in6_msource { RB_ENTRY(ip6_msource) im6s_link; /* Common field */ struct in6_addr im6s_addr; /* Common field */ uint8_t im6sl_st[2]; /* state before/at commit */ }; #ifdef _KERNEL /* * IPv6 source tree comparison function. * * An ordered predicate is necessary; bcmp() is not documented to return * an indication of order, memcmp() is, and is an ISO C99 requirement. */ static __inline int ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) { return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); } RB_PROTOTYPE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * IPv6 multicast PCB-layer group filter descriptor. 
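/*
 * The hunk above converts in6_ifaddr_lock from an rwlock to an rmlock
 * (read-mostly lock), so the read-side macros now take a tracker in which
 * rm_rlock()/rm_runlock() record per-acquisition state; the write-side
 * macros are unchanged, which is why only the readers gain a parameter.
 * A sketch of the resulting reader pattern over the address hash, as a
 * kernel-context fragment (struct rm_priotracker comes from <sys/rmlock.h>;
 * the enclosing function and `addr' are assumed):
 *
 *	struct rm_priotracker in6_ifa_tracker;
 *	struct in6_ifaddr *ia;
 *
 *	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 *	LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) {
 *		if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) {
 *			ifa_ref(&ia->ia_ifa);	// only if kept past unlock
 *			break;
 *		}
 *	}
 *	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 */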
*/ struct in6_mfilter { struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ u_long im6f_nsrc; /* # of source entries */ uint8_t im6f_st[2]; /* state before/at commit */ }; /* * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; LIST_ENTRY(in6_multi_mship) i6mm_chain; }; /* * IPv6 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When MLDv2 is active, in6m_timer is the response to group query timer. * The state-change timer in6m_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: in6m_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in6_multi { LIST_ENTRY(in6_multi) in6m_entry; /* list glue */ struct in6_addr in6m_addr; /* IPv6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ u_int in6m_refcount; /* reference count */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ /* New fields for MLDv2 follow. */ struct mld_ifsoftc *in6m_mli; /* MLD info */ SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ struct ip6_msource_tree in6m_srcs; /* tree of sources */ u_long in6m_nsrc; /* # of tree entries */ struct mbufq in6m_scq; /* queue of pending * state-change packets */ struct timeval in6m_lastgsrtv; /* last G-S-R query */ uint16_t in6m_sctimer; /* state-change timer */ uint16_t in6m_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing MLDv2 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct in6m_st { uint16_t iss_fmode; /* MLD filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } in6m_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, uint8_t t) { t = !!t; if (inm->in6m_st[t].iss_ex > 0 && inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) return (MCAST_EXCLUDE); else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } /* * Lock macros for IPv6 layer multicast address lists. IPv6 lock goes * before link layer multicast locks in the lock order. 
In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in6_multi_mtx; #define IN6_MULTI_LOCK() mtx_lock(&in6_multi_mtx) #define IN6_MULTI_UNLOCK() mtx_unlock(&in6_multi_mtx) #define IN6_MULTI_LOCK_ASSERT() mtx_assert(&in6_multi_mtx, MA_OWNED) #define IN6_MULTI_UNLOCK_ASSERT() mtx_assert(&in6_multi_mtx, MA_NOTOWNED) /* * Look up an in6_multi record for an IPv6 multicast address * on the interface ifp. * If no record found, return NULL. * * SMPng: The IN6_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ static __inline struct in6_multi * in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct ifmultiaddr *ifma; struct in6_multi *inm; IN6_MULTI_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET6) { inm = (struct in6_multi *)ifma->ifma_protospec; if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) break; inm = NULL; } } return (inm); } /* * Wrapper for in6m_lookup_locked(). * * SMPng: Assumes that neithr the IN6_MULTI_LOCK() or IF_ADDR_LOCK() are held. */ static __inline struct in6_multi * in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct in6_multi *inm; IN6_MULTI_LOCK(); IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, mcaddr); IF_ADDR_RUNLOCK(ifp); IN6_MULTI_UNLOCK(); return (inm); } /* Acquire an in6_multi record. */ static __inline void in6m_acquire_locked(struct in6_multi *inm) { IN6_MULTI_LOCK_ASSERT(); ++inm->in6m_refcount; } struct ip6_moptions; struct sockopt; /* Multicast KPIs. */ int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); int in6_mc_join(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_mc_join_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_mc_leave(struct in6_multi *, struct in6_mfilter *); int in6_mc_leave_locked(struct in6_multi *, struct in6_mfilter *); void in6m_clear_recorded(struct in6_multi *); void in6m_commit(struct in6_multi *); void in6m_print(const struct in6_multi *); int in6m_record_source(struct in6_multi *, const struct in6_addr *); void in6m_release_locked(struct in6_multi *); void ip6_freemoptions(struct ip6_moptions *); int ip6_getmoptions(struct inpcb *, struct sockopt *); int ip6_setmoptions(struct inpcb *, struct sockopt *); /* Legacy KAME multicast KPIs. 
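/*
 * Sketch of how the lookup helpers above are typically used by kernel code
 * that only needs a membership test (fragment; `ifp' and `mcaddr' are
 * assumed from the surrounding function).  in6m_lookup() takes and drops
 * IN6_MULTI_LOCK() and the interface address lock itself, so it must be
 * called with neither held; callers that already hold both use
 * in6m_lookup_locked() instead.
 *
 *	struct in6_multi *inm;
 *
 *	inm = in6m_lookup(ifp, &mcaddr);
 *	if (inm != NULL) {
 *		// ifp has joined the group; inm holds the MLD state
 *	}
 */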
*/ struct in6_multi_mship * in6_joingroup(struct ifnet *, struct in6_addr *, int *, int); int in6_leavegroup(struct in6_multi_mship *); /* flags to in6_update_ifa */ #define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ int in6_mask2len(struct in6_addr *, u_char *); int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); void in6_prepare_ifra(struct in6_aliasreq *, const struct in6_addr *, const struct in6_addr *); void in6_purgeaddr(struct ifaddr *); int in6if_do_dad(struct ifnet *); void in6_savemkludge(struct in6_ifaddr *); void *in6_domifattach(struct ifnet *); void in6_domifdetach(struct ifnet *, void *); int in6_domifmtu(struct ifnet *); void in6_setmaxmtu(void); int in6_if2idlen(struct ifnet *); struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, struct in6_addr *); struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t); struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *); int in6_addr2zoneid(struct ifnet *, struct in6_addr *, u_int32_t *); int in6_matchlen(struct in6_addr *, struct in6_addr *); int in6_are_prefix_equal(struct in6_addr *, struct in6_addr *, int); void in6_prefixlen2mask(struct in6_addr *, int); int in6_prefix_ioctl(struct socket *, u_long, caddr_t, struct ifnet *); int in6_prefix_add_ifid(int, struct in6_ifaddr *); void in6_prefix_remove_ifid(int, struct in6_ifaddr *); void in6_purgeprefix(struct ifnet *); int in6_is_addr_deprecated(struct sockaddr_in6 *); int in6_src_ioctl(u_long, caddr_t); void in6_newaddrmsg(struct in6_ifaddr *, int); /* * Extended API for IPv6 FIB support. */ void in6_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in6_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); void in6_rtalloc(struct route_in6 *, u_int); void in6_rtalloc_ign(struct route_in6 *, u_long, u_int); struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_VAR_H_ */ Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 286000) +++ head/sys/netinet6/ip6_input.c (revision 286001) @@ -1,1589 +1,1591 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, .nh_policy = NETISR_POLICY_FLOW, }; VNET_DECLARE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ -struct rwlock in6_ifaddr_lock; -RW_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); +struct rmlock in6_ifaddr_lock; +RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static void ip6_init2(void *); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); TAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. 
*/ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE void ip6_destroy() { int i; if ((i = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, i); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); nd6_destroy(); callout_drain(&V_in6_tmpaddrtimer_ch); } #endif static int ip6_init2_vnet(const void *unused __unused) { /* nd6_timer_init */ callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); /* timer for regeneranation of temporary addresses randomize ID */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); return (0); } static void ip6_init2(void *dummy) { ip6_init2_vnet(NULL); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. 
* Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ m->m_flags &= ~M_FASTFWD_OURS; ours = 1; ip6 = mtod(m, struct ip6_hdr *); goto hbhcheck; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC( ip6s_m2m[m->m_pkthdr.rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. 
*/ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip6_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } if ((m->m_flags & M_IP6_NEXTHOP) && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } /* * Multicast check. 
Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. 
*/ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (nxt == IPPROTO_ICMPV6 && rtalert != ~0) m->m_flags |= M_RTALERT_MLD; nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. 
*/ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. 
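/*
 * ip6_unknown_opt() below dispatches on the two high-order bits of the
 * option type, which RFC 2460 defines as the action to take for an
 * unrecognized option.  A small illustrative userland helper naming the four
 * cases (the function is hypothetical; the IP6OPT_TYPE* constants are the
 * ones from <netinet/ip6.h>):
 */
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <stdint.h>

static const char *
unknown_opt_action(uint8_t opttype)
{
	switch (IP6OPT_TYPE(opttype)) {
	case IP6OPT_TYPE_SKIP:		/* 00: keep parsing */
		return ("skip over option");
	case IP6OPT_TYPE_DISCARD:	/* 01: drop silently */
		return ("discard packet");
	case IP6OPT_TYPE_FORCEICMP:	/* 10: drop, always send ICMPv6 */
		return ("discard packet, send ICMPv6 parameter problem");
	default:			/* 11: drop, ICMPv6 unless mcast dst */
		return ("discard packet, send ICMPv6 unless dst is multicast");
	}
}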
*/ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? 
m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. 
			 */
			switch (nxt) {
			case IPPROTO_DSTOPTS:
			case IPPROTO_ROUTING:
			case IPPROTO_HOPOPTS:
			case IPPROTO_AH: /* is it possible? */
				break;
			default:
				goto loopend;
			}

#ifndef PULLDOWN_TEST
			if (off + sizeof(*ip6e) > m->m_len)
				goto loopend;
			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
			if (nxt == IPPROTO_AH)
				elen = (ip6e->ip6e_len + 2) << 2;
			else
				elen = (ip6e->ip6e_len + 1) << 3;
			if (off + elen > m->m_len)
				goto loopend;
#else
			ext = ip6_pullexthdr(m, off, nxt);
			if (ext == NULL) {
				IP6STAT_INC(ip6s_tooshort);
				return;
			}
			ip6e = mtod(ext, struct ip6_ext *);
			if (nxt == IPPROTO_AH)
				elen = (ip6e->ip6e_len + 2) << 2;
			else
				elen = (ip6e->ip6e_len + 1) << 3;
			if (elen != ext->m_len) {
				m_freem(ext);
				IP6STAT_INC(ip6s_tooshort);
				return;
			}
#endif

			switch (nxt) {
			case IPPROTO_DSTOPTS:
				if (!(in6p->inp_flags & IN6P_DSTOPTS))
					break;

				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
				    IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS),
				    IPPROTO_IPV6);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_ROUTING:
				if (!(in6p->inp_flags & IN6P_RTHDR))
					break;

				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
				    IPPROTO_IPV6);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_HOPOPTS:
			case IPPROTO_AH: /* is it possible? */
				break;

			default:
				/*
				 * other cases have been filtered in the above.
				 * none will visit this case.  here we supply
				 * the code just in case (nxt overwritten or
				 * other cases).
				 */
#ifdef PULLDOWN_TEST
				m_freem(ext);
#endif
				goto loopend;

			}

			/* proceed with the next header. */
			off += elen;
			nxt = ip6e->ip6e_nxt;
			ip6e = NULL;
#ifdef PULLDOWN_TEST
			m_freem(ext);
			ext = NULL;
#endif
		}
	  loopend:
		;
	}
}
#undef IS2292

void
ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu)
{
	struct socket *so;
	struct mbuf *m_mtu;
	struct ip6_mtuinfo mtuctl;

	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));

	/*
	 * Notify the error by sending IPV6_PATHMTU ancillary data if the
	 * application wanted to know the MTU value.
	 * NOTE: we notify disconnected sockets, because some udp
	 * applications keep sending on disconnected sockets.
	 * NOTE: our implementation doesn't notify connected sockets that have
	 * a foreign address different from the given destination address
	 * (this is permitted by RFC 3542).
	 */
	if ((inp->inp_flags & IN6P_MTU) == 0 || (
	    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr)))
		return;

	mtuctl.ip6m_mtu = mtu;
	mtuctl.ip6m_addr = *dst;
	if (sa6_recoverscope(&mtuctl.ip6m_addr))
		return;

	if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl),
	    IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
		return;

	so = inp->inp_socket;
	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
	    == 0) {
		m_freem(m_mtu);
		/* XXX: should count statistics */
	} else
		sorwakeup(so);
}

#ifdef PULLDOWN_TEST
/*
 * pull single extension header from mbuf chain.  returns single mbuf that
 * contains the result, or NULL on error.
 */
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
	struct ip6_ext ip6e;
	size_t elen;
	struct mbuf *n;

#ifdef DIAGNOSTIC
	switch (nxt) {
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_HOPOPTS:
	case IPPROTO_AH: /* is it possible? */
		break;
	default:
		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
	}
#endif

	m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
	if (nxt == IPPROTO_AH)
		elen = (ip6e.ip6e_len + 2) << 2;
	else
		elen = (ip6e.ip6e_len + 1) << 3;

	if (elen > MLEN)
		n = m_getcl(M_NOWAIT, MT_DATA, 0);
	else
		n = m_get(M_NOWAIT, MT_DATA);
	if (n == NULL)
		return NULL;

	m_copydata(m, off, elen, mtod(n, caddr_t));
	n->m_len = elen;

	return n;
}
#endif

/*
 * Get a pointer to the header preceding the one currently being processed.
 * XXX: This function supposes that
 *	M includes all headers,
 *	the next header field and the header length field of each header
 *	are valid, and
 *	the sum of each header length equals OFF.
 * Because of these assumptions, this function must be called very
 * carefully.  Moreover, it will not be used in the near future when
 * we develop a `neater' mechanism to process extension headers.
 */
char *
ip6_get_prevhdr(struct mbuf *m, int off)
{
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

	if (off == sizeof(struct ip6_hdr))
		return (&ip6->ip6_nxt);
	else {
		int len, nxt;
		struct ip6_ext *ip6e = NULL;

		nxt = ip6->ip6_nxt;
		len = sizeof(struct ip6_hdr);
		while (len < off) {
			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len);

			switch (nxt) {
			case IPPROTO_FRAGMENT:
				len += sizeof(struct ip6_frag);
				break;
			case IPPROTO_AH:
				len += (ip6e->ip6e_len + 2) << 2;
				break;
			default:
				len += (ip6e->ip6e_len + 1) << 3;
				break;
			}
			nxt = ip6e->ip6e_nxt;
		}
		if (ip6e)
			return (&ip6e->ip6e_nxt);
		else
			return NULL;
	}
}

/*
 * get next header offset.  m will be retained.
 */
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
	struct ip6_hdr ip6;
	struct ip6_ext ip6e;
	struct ip6_frag fh;

	/* just in case */
	if (m == NULL)
		panic("ip6_nexthdr: m == NULL");
	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
		return -1;

	switch (proto) {
	case IPPROTO_IPV6:
		if (m->m_pkthdr.len < off + sizeof(ip6))
			return -1;
		m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
		if (nxtp)
			*nxtp = ip6.ip6_nxt;
		off += sizeof(ip6);
		return off;

	case IPPROTO_FRAGMENT:
		/*
		 * terminate parsing if it is not the first fragment,
		 * it does not make sense to parse through it.
		 */
		if (m->m_pkthdr.len < off + sizeof(fh))
			return -1;
		m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
		/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
		if (fh.ip6f_offlg & IP6F_OFF_MASK)
			return -1;
		if (nxtp)
			*nxtp = fh.ip6f_nxt;
		off += sizeof(struct ip6_frag);
		return off;

	case IPPROTO_AH:
		if (m->m_pkthdr.len < off + sizeof(ip6e))
			return -1;
		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
		if (nxtp)
			*nxtp = ip6e.ip6e_nxt;
		off += (ip6e.ip6e_len + 2) << 2;
		return off;

	case IPPROTO_HOPOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_DSTOPTS:
		if (m->m_pkthdr.len < off + sizeof(ip6e))
			return -1;
		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
		if (nxtp)
			*nxtp = ip6e.ip6e_nxt;
		off += (ip6e.ip6e_len + 1) << 3;
		return off;

	case IPPROTO_NONE:
	case IPPROTO_ESP:
	case IPPROTO_IPCOMP:
		/* give up */
		return -1;

	default:
		return -1;
	}

	return -1;
}

/*
 * get offset for the last header in the chain.  m will be kept untainted.
 */
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
	int newoff;
	int nxt;

	if (!nxtp) {
		nxt = -1;
		nxtp = &nxt;
	}
	while (1) {
		newoff = ip6_nexthdr(m, off, proto, nxtp);
		if (newoff < 0)
			return off;
		else if (newoff < off)
			return -1;	/* invalid */
		else if (newoff == off)
			return newoff;

		off = newoff;
		proto = *nxtp;
	}
}

/*
 * System control for IP6
 */
u_char	inet6ctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		0,		0,
	ENOPROTOOPT
};
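/*
 * Editor's note: the sketch below is illustrative only and is not part of
 * the original file.  It shows how a caller could use ip6_lasthdr() above
 * to skip every extension header and recover the upper-layer protocol and
 * its offset.  The function example_find_ulp() is hypothetical and exists
 * only to demonstrate the calling convention; it is compiled out.
 */
#ifdef notdef
static int
example_find_ulp(struct mbuf *m, int *protop)
{
	int off, nxt;

	nxt = -1;
	/* Start at offset 0, i.e. at the fixed IPv6 header itself. */
	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
	if (off < 0)
		return (-1);	/* malformed header chain */
	*protop = nxt;		/* e.g. IPPROTO_TCP or IPPROTO_UDP */
	return (off);		/* offset of the upper-layer header */
}
#endif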