Index: head/sys/contrib/pf/net/if_pfsync.c =================================================================== --- head/sys/contrib/pf/net/if_pfsync.c (revision 171636) +++ head/sys/contrib/pf/net/if_pfsync.c (revision 171637) @@ -1,2329 +1,2329 @@ /* $OpenBSD: if_pfsync.c,v 1.73 2006/11/16 13:13:38 henning Exp $ */ /* * Copyright (c) 2002 Michael Shalayeff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef __FreeBSD__ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_carp.h" #include "opt_bpf.h" #include "opt_pf.h" #include __FBSDID("$FreeBSD$"); #ifdef DEV_BPF #define NBPFILTER DEV_BPF #else #define NBPFILTER 0 #endif #ifdef DEV_PFSYNC #define NPFSYNC DEV_PFSYNC #else #define NPFSYNC 0 #endif #ifdef DEV_CARP #define NCARP DEV_CARP #else #define NCARP 0 #endif #endif /* __FreeBSD__ */ #include #ifdef __FreeBSD__ #include #endif #include #include #include #include #include #ifdef __FreeBSD__ #include #include #include #include #include #include #include #include #else #include #include #endif #include #include #ifdef __FreeBSD__ #include #endif #include #include #include #include #include #include #include #ifdef INET #include #include #include #include #endif #ifdef INET6 #include #endif /* INET6 */ #ifndef __FreeBSD__ #include "carp.h" #endif #if NCARP > 0 #include #endif #include #include #ifndef __FreeBSD__ #include "bpfilter.h" #include "pfsync.h" #endif #define PFSYNC_MINMTU \ (sizeof(struct pfsync_header) + sizeof(struct pf_state)) #ifdef PFSYNCDEBUG #define DPRINTF(x) do { if (pfsyncdebug) printf x ; } while (0) int pfsyncdebug; #else #define DPRINTF(x) #endif struct pfsync_softc *pfsyncif = NULL; struct pfsyncstats pfsyncstats; #ifdef __FreeBSD__ SYSCTL_DECL(_net_inet_pfsync); SYSCTL_STRUCT(_net_inet_pfsync, 0, stats, CTLFLAG_RW, &pfsyncstats, pfsyncstats, "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)"); #endif void pfsyncattach(int); #ifdef __FreeBSD__ int pfsync_clone_create(struct if_clone *, int, caddr_t); void pfsync_clone_destroy(struct ifnet *); #else int pfsync_clone_create(struct if_clone *, int); int pfsync_clone_destroy(struct ifnet *); #endif void pfsync_setmtu(struct pfsync_softc *, int); int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, struct pf_state_peer *); int pfsync_insert_net_state(struct pfsync_state *, u_int8_t); #ifdef PFSYNC_TDB void pfsync_update_net_tdb(struct pfsync_tdb *); #endif int 
pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); int pfsyncioctl(struct ifnet *, u_long, caddr_t); void pfsyncstart(struct ifnet *); struct mbuf *pfsync_get_mbuf(struct pfsync_softc *, u_int8_t, void **); int pfsync_request_update(struct pfsync_state_upd *, struct in_addr *); int pfsync_sendout(struct pfsync_softc *); #ifdef PFSYNC_TDB int pfsync_tdb_sendout(struct pfsync_softc *); #endif int pfsync_sendout_mbuf(struct pfsync_softc *, struct mbuf *); void pfsync_timeout(void *); #ifdef PFSYNC_TDB void pfsync_tdb_timeout(void *); #endif void pfsync_send_bus(struct pfsync_softc *, u_int8_t); void pfsync_bulk_update(void *); void pfsync_bulkfail(void *); #ifdef __FreeBSD__ void pfsync_ifdetach(void *, struct ifnet *); void pfsync_senddef(void *, int); /* XXX: ugly */ #define betoh64 (unsigned long long)be64toh #define timeout_del callout_stop #endif int pfsync_sync_ok; #ifndef __FreeBSD__ extern int ifqmaxlen; #endif #ifdef __FreeBSD__ IFC_SIMPLE_DECLARE(pfsync, 1); #else struct if_clone pfsync_cloner = IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy); #endif void pfsyncattach(int npfsync) { if_clone_attach(&pfsync_cloner); } int #ifdef __FreeBSD__ pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) #else pfsync_clone_create(struct if_clone *ifc, int unit) #endif { struct ifnet *ifp; if (unit != 0) return (EINVAL); pfsync_sync_ok = 1; if ((pfsyncif = malloc(sizeof(*pfsyncif), M_DEVBUF, M_NOWAIT)) == NULL) return (ENOMEM); bzero(pfsyncif, sizeof(*pfsyncif)); #ifdef __FreeBSD__ if ((pfsyncif->sc_imo.imo_membership = (struct in_multi **)malloc( (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_DEVBUF, M_NOWAIT)) == NULL) { free(pfsyncif, M_DEVBUF); return (ENOSPC); } pfsyncif->sc_imo.imo_mfilters = NULL; pfsyncif->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; pfsyncif->sc_imo.imo_multicast_vif = -1; ifp = pfsyncif->sc_ifp = if_alloc(IFT_PFSYNC); if (ifp == NULL) { 
free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); free(pfsyncif, M_DEVBUF); return (ENOSPC); } if_initname(ifp, ifc->ifc_name, unit); pfsyncif->sc_detachtag = EVENTHANDLER_REGISTER(ifnet_departure_event, pfsync_ifdetach, pfsyncif, EVENTHANDLER_PRI_ANY); if (pfsyncif->sc_detachtag == NULL) { if_free(ifp); free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); free(pfsyncif, M_DEVBUF); return (ENOSPC); } pfsyncif->sc_ifq.ifq_maxlen = ifqmaxlen; mtx_init(&pfsyncif->sc_ifq.ifq_mtx, ifp->if_xname, "pfsync send queue", MTX_DEF); TASK_INIT(&pfsyncif->sc_send_task, 0, pfsync_senddef, pfsyncif); #endif pfsyncif->sc_mbuf = NULL; pfsyncif->sc_mbuf_net = NULL; #ifdef PFSYNC_TDB pfsyncif->sc_mbuf_tdb = NULL; #endif pfsyncif->sc_statep.s = NULL; pfsyncif->sc_statep_net.s = NULL; #ifdef PFSYNC_TDB pfsyncif->sc_statep_tdb.t = NULL; #endif pfsyncif->sc_maxupdates = 128; #ifdef __FreeBSD__ pfsyncif->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); pfsyncif->sc_sendaddr.s_addr = htonl(INADDR_PFSYNC_GROUP); #else pfsyncif->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; pfsyncif->sc_sendaddr.s_addr = INADDR_PFSYNC_GROUP; #endif pfsyncif->sc_ureq_received = 0; pfsyncif->sc_ureq_sent = 0; pfsyncif->sc_bulk_send_next = NULL; pfsyncif->sc_bulk_terminator = NULL; #ifndef __FreeBSD__ ifp = &pfsyncif->sc_if; snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit); #endif ifp->if_softc = pfsyncif; ifp->if_ioctl = pfsyncioctl; ifp->if_output = pfsyncoutput; ifp->if_start = pfsyncstart; ifp->if_type = IFT_PFSYNC; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_hdrlen = PFSYNC_HDRLEN; pfsync_setmtu(pfsyncif, ETHERMTU); #ifdef __FreeBSD__ - callout_init(&pfsyncif->sc_tmo, NET_CALLOUT_MPSAFE); + callout_init(&pfsyncif->sc_tmo, CALLOUT_MPSAFE); #ifdef PFSYNC_TDB - callout_init(&pfsyncif->sc_tdb_tmo, NET_CALLOUT_MPSAFE); + callout_init(&pfsyncif->sc_tdb_tmo, CALLOUT_MPSAFE); #endif - callout_init(&pfsyncif->sc_bulk_tmo, NET_CALLOUT_MPSAFE); - callout_init(&pfsyncif->sc_bulkfail_tmo, NET_CALLOUT_MPSAFE); + 
callout_init(&pfsyncif->sc_bulk_tmo, CALLOUT_MPSAFE); + callout_init(&pfsyncif->sc_bulkfail_tmo, CALLOUT_MPSAFE); #else timeout_set(&pfsyncif->sc_tmo, pfsync_timeout, pfsyncif); timeout_set(&pfsyncif->sc_tdb_tmo, pfsync_tdb_timeout, pfsyncif); timeout_set(&pfsyncif->sc_bulk_tmo, pfsync_bulk_update, pfsyncif); timeout_set(&pfsyncif->sc_bulkfail_tmo, pfsync_bulkfail, pfsyncif); #endif if_attach(ifp); #ifndef __FreeBSD__ if_alloc_sadl(ifp); #endif #if NCARP > 0 if_addgroup(ifp, "carp"); #endif #if NBPFILTER > 0 #ifdef __FreeBSD__ bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); #else bpfattach(&pfsyncif->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN); #endif #endif return (0); } #ifdef __FreeBSD__ void #else int #endif pfsync_clone_destroy(struct ifnet *ifp) { #ifdef __FreeBSD__ EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfsyncif->sc_detachtag); callout_stop(&pfsyncif->sc_tmo); #ifdef PFSYNC_TDB callout_stop(&pfsyncif->sc_tdb_tmo); #endif callout_stop(&pfsyncif->sc_bulk_tmo); callout_stop(&pfsyncif->sc_bulkfail_tmo); /* XXX: more? */ #endif #if NBPFILTER > 0 bpfdetach(ifp); #endif if_detach(ifp); #ifdef __FreeBSD__ if_free(ifp); free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); #endif free(pfsyncif, M_DEVBUF); pfsyncif = NULL; #ifndef __FreeBSD__ return (0); #endif } /* * Start output on the pfsync interface. 
*/ void pfsyncstart(struct ifnet *ifp) { struct mbuf *m; #ifndef __FreeBSD__ int s; #endif for (;;) { #ifdef __FreeBSD__ IF_LOCK(&ifp->if_snd); _IF_DROP(&ifp->if_snd); _IF_DEQUEUE(&ifp->if_snd, m); IF_UNLOCK(&ifp->if_snd); #else s = splnet(); IF_DROP(&ifp->if_snd); IF_DEQUEUE(&ifp->if_snd, m); splx(s); #endif if (m == NULL) return; else m_freem(m); } } int pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, struct pf_state_peer *d) { if (s->scrub.scrub_flag && d->scrub == NULL) { d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT); if (d->scrub == NULL) return (ENOMEM); bzero(d->scrub, sizeof(*d->scrub)); } return (0); } int pfsync_insert_net_state(struct pfsync_state *sp, u_int8_t chksum_flag) { struct pf_state *st = NULL; struct pf_rule *r = NULL; struct pfi_kif *kif; if (sp->creatorid == 0 && pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_insert_net_state: invalid creator id:" " %08x\n", ntohl(sp->creatorid)); return (EINVAL); } kif = pfi_kif_get(sp->ifname); if (kif == NULL) { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert_net_state: " "unknown interface: %s\n", sp->ifname); /* skip this state */ return (0); } /* * If the ruleset checksums match, it's safe to associate the state * with the rule of that number. 
*/ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && chksum_flag) r = pf_main_ruleset.rules[ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; else r = &pf_default_rule; if (!r->max_states || r->states < r->max_states) st = pool_get(&pf_state_pl, PR_NOWAIT); if (st == NULL) { pfi_kif_unref(kif, PFI_KIF_REF_NONE); return (ENOMEM); } bzero(st, sizeof(*st)); /* allocate memory for scrub info */ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) { pfi_kif_unref(kif, PFI_KIF_REF_NONE); if (st->src.scrub) pool_put(&pf_state_scrub_pl, st->src.scrub); pool_put(&pf_state_pl, st); return (ENOMEM); } st->rule.ptr = r; /* XXX get pointers to nat_rule and anchor */ /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ r->states++; /* fill in the rest of the state entry */ pf_state_host_ntoh(&sp->lan, &st->lan); pf_state_host_ntoh(&sp->gwy, &st->gwy); pf_state_host_ntoh(&sp->ext, &st->ext); pf_state_peer_ntoh(&sp->src, &st->src); pf_state_peer_ntoh(&sp->dst, &st->dst); bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); st->creation = time_second - ntohl(sp->creation); st->expire = ntohl(sp->expire) + time_second; st->af = sp->af; st->proto = sp->proto; st->direction = sp->direction; st->log = sp->log; st->timeout = sp->timeout; st->allow_opts = sp->allow_opts; bcopy(sp->id, &st->id, sizeof(st->id)); st->creatorid = sp->creatorid; st->sync_flags = PFSTATE_FROMSYNC; if (pf_insert_state(kif, st)) { pfi_kif_unref(kif, PFI_KIF_REF_NONE); /* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */ r->states--; if (st->dst.scrub) pool_put(&pf_state_scrub_pl, st->dst.scrub); if (st->src.scrub) pool_put(&pf_state_scrub_pl, st->src.scrub); pool_put(&pf_state_pl, st); return (EINVAL); } return (0); } void #ifdef __FreeBSD__ pfsync_input(struct mbuf *m, __unused int off) #else pfsync_input(struct mbuf *m, ...) 
#endif { struct ip *ip = mtod(m, struct ip *); struct pfsync_header *ph; struct pfsync_softc *sc = pfsyncif; struct pf_state *st; struct pf_state_cmp key; struct pfsync_state *sp; struct pfsync_state_upd *up; struct pfsync_state_del *dp; struct pfsync_state_clr *cp; struct pfsync_state_upd_req *rup; struct pfsync_state_bus *bus; #ifdef PFSYNC_TDB struct pfsync_tdb *pt; #endif struct in_addr src; struct mbuf *mp; int iplen, action, error, i, s, count, offp, sfail, stale = 0; u_int8_t chksum_flag = 0; pfsyncstats.pfsyncs_ipackets++; /* verify that we have a sync interface configured */ if (!sc || !sc->sc_sync_ifp || !pf_status.running) goto done; /* verify that the packet came in on the right interface */ if (sc->sc_sync_ifp != m->m_pkthdr.rcvif) { pfsyncstats.pfsyncs_badif++; goto done; } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != PFSYNC_DFLTTL) { pfsyncstats.pfsyncs_badttl++; goto done; } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ph)) { pfsyncstats.pfsyncs_hdrops++; goto done; } if (iplen + sizeof(*ph) > m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ph))) == NULL) { pfsyncstats.pfsyncs_hdrops++; goto done; } ip = mtod(m, struct ip *); } ph = (struct pfsync_header *)((char *)ip + iplen); /* verify the version */ if (ph->version != PFSYNC_VERSION) { pfsyncstats.pfsyncs_badver++; goto done; } action = ph->action; count = ph->count; /* make sure it's a valid action code */ if (action >= PFSYNC_ACT_MAX) { pfsyncstats.pfsyncs_badact++; goto done; } /* Cheaper to grab this now than having to mess with mbufs later */ src = ip->ip_src; if (!bcmp(&ph->pf_chksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) chksum_flag++; switch (action) { case PFSYNC_ACT_CLR: { struct pf_state *nexts; struct pfi_kif *kif; u_int32_t creatorid; if ((mp = m_pulldown(m, iplen + sizeof(*ph), sizeof(*cp), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } cp = (struct pfsync_state_clr *)(mp->m_data + offp); creatorid = cp->creatorid; s = splsoftnet(); 
#ifdef __FreeBSD__ PF_LOCK(); #endif if (cp->ifname[0] == '\0') { for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) { nexts = RB_NEXT(pf_state_tree_id, &tree_id, st); if (st->creatorid == creatorid) { st->sync_flags |= PFSTATE_FROMSYNC; pf_unlink_state(st); } } } else { if ((kif = pfi_kif_get(cp->ifname)) == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); return; } for (st = RB_MIN(pf_state_tree_lan_ext, &kif->pfik_lan_ext); st; st = nexts) { nexts = RB_NEXT(pf_state_tree_lan_ext, &kif->pfik_lan_ext, st); if (st->creatorid == creatorid) { st->sync_flags |= PFSTATE_FROMSYNC; pf_unlink_state(st); } } } #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; } case PFSYNC_ACT_INS: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*sp), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); i < count; i++, sp++) { /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST || sp->direction > PF_OUT || (sp->af != AF_INET && sp->af != AF_INET6)) { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_INS: " "invalid value\n"); pfsyncstats.pfsyncs_badstate++; continue; } if ((error = pfsync_insert_net_state(sp, chksum_flag))) { if (error == ENOMEM) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); goto done; } continue; } } #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; case PFSYNC_ACT_UPD: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*sp), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); i < count; i++, sp++) { int flags = PFSYNC_FLAG_STALE; /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST) { if 
(pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_UPD: " "invalid value\n"); pfsyncstats.pfsyncs_badstate++; continue; } bcopy(sp->id, &key.id, sizeof(key.id)); key.creatorid = sp->creatorid; st = pf_find_state_byid(&key); if (st == NULL) { /* insert the update */ if (pfsync_insert_net_state(sp, chksum_flag)) pfsyncstats.pfsyncs_badstate++; continue; } sfail = 0; if (st->proto == IPPROTO_TCP) { /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if (st->src.state > sp->src.state && (st->src.state < PF_TCPS_PROXY_SRC || sp->src.state >= PF_TCPS_PROXY_SRC)) sfail = 1; else if (SEQ_GT(st->src.seqlo, ntohl(sp->src.seqlo))) sfail = 3; else if (st->dst.state > sp->dst.state) { /* There might still be useful * information about the src state here, * so import that part of the update, * then "fail" so we send the updated * state back to the peer who is missing * our what we know. */ pf_state_peer_ntoh(&sp->src, &st->src); /* XXX do anything with timeouts? */ sfail = 7; flags = 0; } else if (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(sp->dst.seqlo))) sfail = 4; } else { /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > sp->src.state) sfail = 5; else if (st->dst.state > sp->dst.state) sfail = 6; } if (sfail) { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: %s stale update " "(%d) id: %016llx " "creatorid: %08x\n", (sfail < 7 ? 
"ignoring" : "partial"), sfail, betoh64(st->id), ntohl(st->creatorid)); pfsyncstats.pfsyncs_badstate++; if (!(sp->sync_flags & PFSTATE_STALE)) { /* we have a better state, send it */ if (sc->sc_mbuf != NULL && !stale) pfsync_sendout(sc); stale++; if (!st->sync_flags) pfsync_pack_state( PFSYNC_ACT_UPD, st, flags); } continue; } pfsync_alloc_scrub_memory(&sp->dst, &st->dst); pf_state_peer_ntoh(&sp->src, &st->src); pf_state_peer_ntoh(&sp->dst, &st->dst); st->expire = ntohl(sp->expire) + time_second; st->timeout = sp->timeout; } if (stale && sc->sc_mbuf != NULL) pfsync_sendout(sc); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; /* * It's not strictly necessary for us to support the "uncompressed" * delete action, but it's relatively simple and maintains consistency. */ case PFSYNC_ACT_DEL: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*sp), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); i < count; i++, sp++) { bcopy(sp->id, &key.id, sizeof(key.id)); key.creatorid = sp->creatorid; st = pf_find_state_byid(&key); if (st == NULL) { pfsyncstats.pfsyncs_badstate++; continue; } st->sync_flags |= PFSTATE_FROMSYNC; pf_unlink_state(st); } #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; case PFSYNC_ACT_UPD_C: { int update_requested = 0; if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*up), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, up = (struct pfsync_state_upd *)(mp->m_data + offp); i < count; i++, up++) { /* check for invalid values */ if (up->timeout >= PFTM_MAX || up->src.state > PF_TCPS_PROXY_DST || up->dst.state > PF_TCPS_PROXY_DST) { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: " "PFSYNC_ACT_UPD_C: " "invalid value\n"); pfsyncstats.pfsyncs_badstate++; continue; } bcopy(up->id, &key.id, sizeof(key.id)); 
key.creatorid = up->creatorid; st = pf_find_state_byid(&key); if (st == NULL) { /* We don't have this state. Ask for it. */ error = pfsync_request_update(up, &src); if (error == ENOMEM) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); goto done; } update_requested = 1; pfsyncstats.pfsyncs_badstate++; continue; } sfail = 0; if (st->proto == IPPROTO_TCP) { /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if (st->src.state > up->src.state && (st->src.state < PF_TCPS_PROXY_SRC || up->src.state >= PF_TCPS_PROXY_SRC)) sfail = 1; else if (st->dst.state > up->dst.state) sfail = 2; else if (SEQ_GT(st->src.seqlo, ntohl(up->src.seqlo))) sfail = 3; else if (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(up->dst.seqlo))) sfail = 4; } else { /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > up->src.state) sfail = 5; else if (st->dst.state > up->dst.state) sfail = 6; } if (sfail) { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: ignoring stale update " "(%d) id: %016llx " "creatorid: %08x\n", sfail, betoh64(st->id), ntohl(st->creatorid)); pfsyncstats.pfsyncs_badstate++; /* we have a better state, send it out */ if ((!stale || update_requested) && sc->sc_mbuf != NULL) { pfsync_sendout(sc); update_requested = 0; } stale++; if (!st->sync_flags) pfsync_pack_state(PFSYNC_ACT_UPD, st, PFSYNC_FLAG_STALE); continue; } pfsync_alloc_scrub_memory(&up->dst, &st->dst); pf_state_peer_ntoh(&up->src, &st->src); pf_state_peer_ntoh(&up->dst, &st->dst); st->expire = ntohl(up->expire) + time_second; st->timeout = up->timeout; } if ((update_requested || stale) && sc->sc_mbuf) pfsync_sendout(sc); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; } case PFSYNC_ACT_DEL_C: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*dp), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, dp = 
(struct pfsync_state_del *)(mp->m_data + offp); i < count; i++, dp++) { bcopy(dp->id, &key.id, sizeof(key.id)); key.creatorid = dp->creatorid; st = pf_find_state_byid(&key); if (st == NULL) { pfsyncstats.pfsyncs_badstate++; continue; } st->sync_flags |= PFSTATE_FROMSYNC; pf_unlink_state(st); } #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; case PFSYNC_ACT_INS_F: case PFSYNC_ACT_DEL_F: /* not implemented */ break; case PFSYNC_ACT_UREQ: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*rup), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif if (sc->sc_mbuf != NULL) pfsync_sendout(sc); for (i = 0, rup = (struct pfsync_state_upd_req *)(mp->m_data + offp); i < count; i++, rup++) { bcopy(rup->id, &key.id, sizeof(key.id)); key.creatorid = rup->creatorid; if (key.id == 0 && key.creatorid == 0) { sc->sc_ureq_received = time_uptime; if (sc->sc_bulk_send_next == NULL) sc->sc_bulk_send_next = TAILQ_FIRST(&state_list); sc->sc_bulk_terminator = sc->sc_bulk_send_next; if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received " "bulk update request\n"); pfsync_send_bus(sc, PFSYNC_BUS_START); #ifdef __FreeBSD__ callout_reset(&sc->sc_bulk_tmo, 1 * hz, pfsync_bulk_update, pfsyncif); #else timeout_add(&sc->sc_bulk_tmo, 1 * hz); #endif } else { st = pf_find_state_byid(&key); if (st == NULL) { pfsyncstats.pfsyncs_badstate++; continue; } if (!st->sync_flags) pfsync_pack_state(PFSYNC_ACT_UPD, st, 0); } } if (sc->sc_mbuf != NULL) pfsync_sendout(sc); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; case PFSYNC_ACT_BUS: /* If we're not waiting for a bulk update, who cares. 
*/ if (sc->sc_ureq_sent == 0) break; if ((mp = m_pulldown(m, iplen + sizeof(*ph), sizeof(*bus), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } bus = (struct pfsync_state_bus *)(mp->m_data + offp); switch (bus->status) { case PFSYNC_BUS_START: #ifdef __FreeBSD__ callout_reset(&sc->sc_bulkfail_tmo, pf_pool_limits[PF_LIMIT_STATES].limit / (PFSYNC_BULKPACKETS * sc->sc_maxcount), pfsync_bulkfail, pfsyncif); #else timeout_add(&sc->sc_bulkfail_tmo, pf_pool_limits[PF_LIMIT_STATES].limit / (PFSYNC_BULKPACKETS * sc->sc_maxcount)); #endif if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk " "update start\n"); break; case PFSYNC_BUS_END: if (time_uptime - ntohl(bus->endtime) >= sc->sc_ureq_sent) { /* that's it, we're happy */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; timeout_del(&sc->sc_bulkfail_tmo); #if NCARP > 0 if (!pfsync_sync_ok) #ifdef __FreeBSD__ #ifdef CARP_ADVANCED carp_group_demote_adj(sc->sc_ifp, -1); #endif #else carp_group_demote_adj(&sc->sc_if, -1); #endif #endif pfsync_sync_ok = 1; if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received valid " "bulk update end\n"); } else { if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received invalid " "bulk update end: bad timestamp\n"); } break; } break; #ifdef PFSYNC_TDB case PFSYNC_ACT_TDB_UPD: if ((mp = m_pulldown(m, iplen + sizeof(*ph), count * sizeof(*pt), &offp)) == NULL) { pfsyncstats.pfsyncs_badlen++; return; } s = splsoftnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif for (i = 0, pt = (struct pfsync_tdb *)(mp->m_data + offp); i < count; i++, pt++) pfsync_update_net_tdb(pt); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; #endif } done: if (m) m_freem(m); } int pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { m_freem(m); return (0); } /* ARGSUSED */ int pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { #ifndef __FreeBSD__ struct proc *p = curproc; #endif struct pfsync_softc *sc = ifp->if_softc; struct ifreq *ifr = 
(struct ifreq *)data; struct ip_moptions *imo = &sc->sc_imo; struct pfsyncreq pfsyncr; struct ifnet *sifp; int s, error; switch (cmd) { case SIOCSIFADDR: case SIOCAIFADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: #ifdef __FreeBSD__ if (ifp->if_flags & IFF_UP) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; #else if (ifp->if_flags & IFF_UP) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; #endif break; case SIOCSIFMTU: if (ifr->ifr_mtu < PFSYNC_MINMTU) return (EINVAL); if (ifr->ifr_mtu > MCLBYTES) ifr->ifr_mtu = MCLBYTES; s = splnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif if (ifr->ifr_mtu < ifp->if_mtu) pfsync_sendout(sc); pfsync_setmtu(sc, ifr->ifr_mtu); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; case SIOCGETPFSYNC: bzero(&pfsyncr, sizeof(pfsyncr)); if (sc->sc_sync_ifp) strlcpy(pfsyncr.pfsyncr_syncdev, sc->sc_sync_ifp->if_xname, IFNAMSIZ); pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; if ((error = copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)))) return (error); break; case SIOCSETPFSYNC: #ifdef __FreeBSD__ if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) #else if ((error = suser(p, p->p_acflag)) != 0) #endif return (error); if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) return (error); #ifdef __FreeBSD__ PF_LOCK(); #endif if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) #ifdef __FreeBSD__ sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); #else sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; #endif else sc->sc_sync_peer.s_addr = pfsyncr.pfsyncr_syncpeer.s_addr; if (pfsyncr.pfsyncr_maxupdates > 255) #ifdef __FreeBSD__ { PF_UNLOCK(); #endif return (EINVAL); #ifdef __FreeBSD__ } #endif sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; if (pfsyncr.pfsyncr_syncdev[0] == 0) { sc->sc_sync_ifp = NULL; if (sc->sc_mbuf_net != NULL) { /* Don't keep stale pfsync packets around. 
*/ s = splnet(); m_freem(sc->sc_mbuf_net); sc->sc_mbuf_net = NULL; sc->sc_statep_net.s = NULL; splx(s); } #ifdef __FreeBSD__ PF_UNLOCK(); #endif if (imo->imo_num_memberships > 0) { in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); imo->imo_multicast_ifp = NULL; } break; } #ifdef __FreeBSD__ PF_UNLOCK(); #endif if ((sifp = ifunit(pfsyncr.pfsyncr_syncdev)) == NULL) return (EINVAL); #ifdef __FreeBSD__ PF_LOCK(); #endif s = splnet(); #ifdef __FreeBSD__ if (sifp->if_mtu < sc->sc_ifp->if_mtu || #else if (sifp->if_mtu < sc->sc_if.if_mtu || #endif (sc->sc_sync_ifp != NULL && sifp->if_mtu < sc->sc_sync_ifp->if_mtu) || sifp->if_mtu < MCLBYTES - sizeof(struct ip)) pfsync_sendout(sc); sc->sc_sync_ifp = sifp; #ifdef __FreeBSD__ pfsync_setmtu(sc, sc->sc_ifp->if_mtu); #else pfsync_setmtu(sc, sc->sc_if.if_mtu); #endif if (imo->imo_num_memberships > 0) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); #ifdef __FreeBSD__ PF_LOCK(); #endif imo->imo_multicast_ifp = NULL; } if (sc->sc_sync_ifp && #ifdef __FreeBSD__ sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { #else sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { #endif struct in_addr addr; if (!(sc->sc_sync_ifp->if_flags & IFF_MULTICAST)) { sc->sc_sync_ifp = NULL; #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); return (EADDRNOTAVAIL); } #ifdef __FreeBSD__ addr.s_addr = htonl(INADDR_PFSYNC_GROUP); #else addr.s_addr = INADDR_PFSYNC_GROUP; #endif #ifdef __FreeBSD__ PF_UNLOCK(); #endif if ((imo->imo_membership[0] = in_addmulti(&addr, sc->sc_sync_ifp)) == NULL) { sc->sc_sync_ifp = NULL; splx(s); return (ENOBUFS); } #ifdef __FreeBSD__ PF_LOCK(); #endif imo->imo_num_memberships++; imo->imo_multicast_ifp = sc->sc_sync_ifp; imo->imo_multicast_ttl = PFSYNC_DFLTTL; imo->imo_multicast_loop = 0; } if (sc->sc_sync_ifp || #ifdef __FreeBSD__ sc->sc_sendaddr.s_addr != htonl(INADDR_PFSYNC_GROUP)) { #else sc->sc_sendaddr.s_addr != INADDR_PFSYNC_GROUP) { #endif /* Request a 
full state table update. */ sc->sc_ureq_sent = time_uptime; #if NCARP > 0 if (pfsync_sync_ok) #ifdef __FreeBSD__ #ifdef CARP_ADVANCED carp_group_demote_adj(sc->sc_ifp, 1); #endif #else carp_group_demote_adj(&sc->sc_if, 1); #endif #endif pfsync_sync_ok = 0; if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: requesting bulk update\n"); #ifdef __FreeBSD__ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulkfail, pfsyncif); #else timeout_add(&sc->sc_bulkfail_tmo, 5 * hz); #endif error = pfsync_request_update(NULL, NULL); if (error == ENOMEM) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); return (ENOMEM); } pfsync_sendout(sc); } #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); break; default: return (ENOTTY); } return (0); } void pfsync_setmtu(struct pfsync_softc *sc, int mtu_req) { int mtu; if (sc->sc_sync_ifp && sc->sc_sync_ifp->if_mtu < mtu_req) mtu = sc->sc_sync_ifp->if_mtu; else mtu = mtu_req; sc->sc_maxcount = (mtu - sizeof(struct pfsync_header)) / sizeof(struct pfsync_state); if (sc->sc_maxcount > 254) sc->sc_maxcount = 254; #ifdef __FreeBSD__ sc->sc_ifp->if_mtu = sizeof(struct pfsync_header) + #else sc->sc_if.if_mtu = sizeof(struct pfsync_header) + #endif sc->sc_maxcount * sizeof(struct pfsync_state); } struct mbuf * pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp) { struct pfsync_header *h; struct mbuf *m; int len; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { #ifdef __FreeBSD__ sc->sc_ifp->if_oerrors++; #else sc->sc_if.if_oerrors++; #endif return (NULL); } switch (action) { case PFSYNC_ACT_CLR: len = sizeof(struct pfsync_header) + sizeof(struct pfsync_state_clr); break; case PFSYNC_ACT_UPD_C: len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd)) + sizeof(struct pfsync_header); break; case PFSYNC_ACT_DEL_C: len = (sc->sc_maxcount * sizeof(struct pfsync_state_del)) + sizeof(struct pfsync_header); break; case PFSYNC_ACT_UREQ: len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd_req)) + sizeof(struct pfsync_header); break; 
case PFSYNC_ACT_BUS: len = sizeof(struct pfsync_header) + sizeof(struct pfsync_state_bus); break; #ifdef PFSYNC_TDB case PFSYNC_ACT_TDB_UPD: len = (sc->sc_maxcount * sizeof(struct pfsync_tdb)) + sizeof(struct pfsync_header); break; #endif default: len = (sc->sc_maxcount * sizeof(struct pfsync_state)) + sizeof(struct pfsync_header); break; } if (len > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); #ifdef __FreeBSD__ sc->sc_ifp->if_oerrors++; #else sc->sc_if.if_oerrors++; #endif return (NULL); } m->m_data += (MCLBYTES - len) &~ (sizeof(long) - 1); } else MH_ALIGN(m, len); m->m_pkthdr.rcvif = NULL; m->m_pkthdr.len = m->m_len = sizeof(struct pfsync_header); h = mtod(m, struct pfsync_header *); h->version = PFSYNC_VERSION; h->af = 0; h->count = 0; h->action = action; #ifndef PFSYNC_TDB if (action != PFSYNC_ACT_TDB_UPD) #endif bcopy(&pf_status.pf_chksum, &h->pf_chksum, PF_MD5_DIGEST_LENGTH); *sp = (void *)((char *)h + PFSYNC_HDRLEN); #ifdef PFSYNC_TDB if (action == PFSYNC_ACT_TDB_UPD) #ifdef __FreeBSD__ callout_reset(&sc->sc_tdb_tmo, hz, pfsync_tdb_timeout, pfsyncif); #else timeout_add(&sc->sc_tdb_tmo, hz); #endif else #endif #ifdef __FreeBSD__ callout_reset(&sc->sc_tmo, hz, pfsync_timeout, pfsyncif); #else timeout_add(&sc->sc_tmo, hz); #endif return (m); } int pfsync_pack_state(u_int8_t action, struct pf_state *st, int flags) { struct ifnet *ifp = NULL; struct pfsync_softc *sc = pfsyncif; struct pfsync_header *h, *h_net; struct pfsync_state *sp = NULL; struct pfsync_state_upd *up = NULL; struct pfsync_state_del *dp = NULL; struct pf_rule *r; u_long secs; int s, ret = 0; u_int8_t i = 255, newaction = 0; if (sc == NULL) return (0); #ifdef __FreeBSD__ ifp = sc->sc_ifp; #else ifp = &sc->sc_if; #endif /* * If a packet falls in the forest and there's nobody around to * hear, does it make a sound? 
*/ if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL && #ifdef __FreeBSD__ sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { #else sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { #endif /* Don't leave any stale pfsync packets hanging around. */ if (sc->sc_mbuf != NULL) { m_freem(sc->sc_mbuf); sc->sc_mbuf = NULL; sc->sc_statep.s = NULL; } return (0); } if (action >= PFSYNC_ACT_MAX) return (EINVAL); s = splnet(); #ifdef __FreeBSD__ PF_ASSERT(MA_OWNED); #endif if (sc->sc_mbuf == NULL) { if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action, (void *)&sc->sc_statep.s)) == NULL) { splx(s); return (ENOMEM); } h = mtod(sc->sc_mbuf, struct pfsync_header *); } else { h = mtod(sc->sc_mbuf, struct pfsync_header *); if (h->action != action) { pfsync_sendout(sc); if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action, (void *)&sc->sc_statep.s)) == NULL) { splx(s); return (ENOMEM); } h = mtod(sc->sc_mbuf, struct pfsync_header *); } else { /* * If it's an update, look in the packet to see if * we already have an update for the state. 
*/ if (action == PFSYNC_ACT_UPD && sc->sc_maxupdates) { struct pfsync_state *usp = (void *)((char *)h + PFSYNC_HDRLEN); for (i = 0; i < h->count; i++) { if (!memcmp(usp->id, &st->id, PFSYNC_ID_LEN) && usp->creatorid == st->creatorid) { sp = usp; sp->updates++; break; } usp++; } } } } secs = time_second; st->pfsync_time = time_uptime; if (sp == NULL) { /* not a "duplicate" update */ i = 255; sp = sc->sc_statep.s++; sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(struct pfsync_state); h->count++; bzero(sp, sizeof(*sp)); bcopy(&st->id, sp->id, sizeof(sp->id)); sp->creatorid = st->creatorid; strlcpy(sp->ifname, st->u.s.kif->pfik_name, sizeof(sp->ifname)); pf_state_host_hton(&st->lan, &sp->lan); pf_state_host_hton(&st->gwy, &sp->gwy); pf_state_host_hton(&st->ext, &sp->ext); bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr)); sp->creation = htonl(secs - st->creation); pf_state_counter_hton(st->packets[0], sp->packets[0]); pf_state_counter_hton(st->packets[1], sp->packets[1]); pf_state_counter_hton(st->bytes[0], sp->bytes[0]); pf_state_counter_hton(st->bytes[1], sp->bytes[1]); if ((r = st->rule.ptr) == NULL) sp->rule = htonl(-1); else sp->rule = htonl(r->nr); if ((r = st->anchor.ptr) == NULL) sp->anchor = htonl(-1); else sp->anchor = htonl(r->nr); sp->af = st->af; sp->proto = st->proto; sp->direction = st->direction; sp->log = st->log; sp->allow_opts = st->allow_opts; sp->timeout = st->timeout; if (flags & PFSYNC_FLAG_STALE) sp->sync_flags |= PFSTATE_STALE; } pf_state_peer_hton(&st->src, &sp->src); pf_state_peer_hton(&st->dst, &sp->dst); if (st->expire <= secs) sp->expire = htonl(0); else sp->expire = htonl(st->expire - secs); /* do we need to build "compressed" actions for network transfer? 
*/ if (sc->sc_sync_ifp && flags & PFSYNC_FLAG_COMPRESS) { switch (action) { case PFSYNC_ACT_UPD: newaction = PFSYNC_ACT_UPD_C; break; case PFSYNC_ACT_DEL: newaction = PFSYNC_ACT_DEL_C; break; default: /* by default we just send the uncompressed states */ break; } } if (newaction) { if (sc->sc_mbuf_net == NULL) { if ((sc->sc_mbuf_net = pfsync_get_mbuf(sc, newaction, (void *)&sc->sc_statep_net.s)) == NULL) { splx(s); return (ENOMEM); } } h_net = mtod(sc->sc_mbuf_net, struct pfsync_header *); switch (newaction) { case PFSYNC_ACT_UPD_C: if (i != 255) { up = (void *)((char *)h_net + PFSYNC_HDRLEN + (i * sizeof(*up))); up->updates++; } else { h_net->count++; sc->sc_mbuf_net->m_pkthdr.len = sc->sc_mbuf_net->m_len += sizeof(*up); up = sc->sc_statep_net.u++; bzero(up, sizeof(*up)); bcopy(&st->id, up->id, sizeof(up->id)); up->creatorid = st->creatorid; } up->timeout = st->timeout; up->expire = sp->expire; up->src = sp->src; up->dst = sp->dst; break; case PFSYNC_ACT_DEL_C: sc->sc_mbuf_net->m_pkthdr.len = sc->sc_mbuf_net->m_len += sizeof(*dp); dp = sc->sc_statep_net.d++; h_net->count++; bzero(dp, sizeof(*dp)); bcopy(&st->id, dp->id, sizeof(dp->id)); dp->creatorid = st->creatorid; break; } } if (h->count == sc->sc_maxcount || (sc->sc_maxupdates && (sp->updates >= sc->sc_maxupdates))) ret = pfsync_sendout(sc); splx(s); return (ret); } /* This must be called in splnet() */ int pfsync_request_update(struct pfsync_state_upd *up, struct in_addr *src) { struct ifnet *ifp = NULL; struct pfsync_header *h; struct pfsync_softc *sc = pfsyncif; struct pfsync_state_upd_req *rup; int ret = 0; if (sc == NULL) return (0); #ifdef __FreeBSD__ ifp = sc->sc_ifp; #else ifp = &sc->sc_if; #endif if (sc->sc_mbuf == NULL) { if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ, (void *)&sc->sc_statep.s)) == NULL) return (ENOMEM); h = mtod(sc->sc_mbuf, struct pfsync_header *); } else { h = mtod(sc->sc_mbuf, struct pfsync_header *); if (h->action != PFSYNC_ACT_UREQ) { pfsync_sendout(sc); if 
((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ, (void *)&sc->sc_statep.s)) == NULL) return (ENOMEM); h = mtod(sc->sc_mbuf, struct pfsync_header *); } } if (src != NULL) sc->sc_sendaddr = *src; sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*rup); h->count++; rup = sc->sc_statep.r++; bzero(rup, sizeof(*rup)); if (up != NULL) { bcopy(up->id, rup->id, sizeof(rup->id)); rup->creatorid = up->creatorid; } if (h->count == sc->sc_maxcount) ret = pfsync_sendout(sc); return (ret); } int pfsync_clear_states(u_int32_t creatorid, char *ifname) { struct ifnet *ifp = NULL; struct pfsync_softc *sc = pfsyncif; struct pfsync_state_clr *cp; int s, ret; if (sc == NULL) return (0); #ifdef __FreeBSD__ ifp = sc->sc_ifp; #else ifp = &sc->sc_if; #endif #ifdef __FreeBSD__ PF_ASSERT(MA_OWNED); #endif s = splnet(); if (sc->sc_mbuf != NULL) pfsync_sendout(sc); if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_CLR, (void *)&sc->sc_statep.c)) == NULL) { splx(s); return (ENOMEM); } sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*cp); cp = sc->sc_statep.c; cp->creatorid = creatorid; if (ifname != NULL) strlcpy(cp->ifname, ifname, IFNAMSIZ); ret = (pfsync_sendout(sc)); splx(s); return (ret); } void pfsync_timeout(void *v) { struct pfsync_softc *sc = v; int s; s = splnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif pfsync_sendout(sc); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); } #ifdef PFSYNC_TDB void pfsync_tdb_timeout(void *v) { struct pfsync_softc *sc = v; int s; s = splnet(); #ifdef __FreeBSD__ PF_LOCK(); #endif pfsync_tdb_sendout(sc); #ifdef __FreeBSD__ PF_UNLOCK(); #endif splx(s); } #endif /* This must be called in splnet() */ void pfsync_send_bus(struct pfsync_softc *sc, u_int8_t status) { struct pfsync_state_bus *bus; #ifdef __FreeBSD__ PF_ASSERT(MA_OWNED); #endif if (sc->sc_mbuf != NULL) pfsync_sendout(sc); if (pfsync_sync_ok && (sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_BUS, (void *)&sc->sc_statep.b)) != NULL) { sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += 
sizeof(*bus);
		bus = sc->sc_statep.b;
		bus->creatorid = pf_status.hostid;
		bus->status = status;
		bus->endtime = htonl(time_uptime - sc->sc_ureq_received);
		/* A bus message is never batched; send it right away. */
		pfsync_sendout(sc);
	}
}

/*
 * Timer handler for sc_bulk_tmo: send the next slice of a bulk update.
 *
 * Walks the state list from sc_bulk_send_next, packing every state that
 * still needs to be sent, until either the per-run budget
 * (sc_maxcount * PFSYNC_BULKPACKETS) is used up or the walk reaches
 * sc_bulk_terminator.  When the walk is complete a PFSYNC_BUS_END message
 * is sent and the bulk bookkeeping is reset; otherwise the handler re-arms
 * itself to continue one tick later.  v is the pfsync softc.
 */
void
pfsync_bulk_update(void *v)
{
	struct pfsync_softc *sc = v;
	int s, i = 0;
	struct pf_state *state;

	s = splnet();
#ifdef __FreeBSD__
	PF_LOCK();
#endif
	/* Start from an empty packet so only bulk updates share it. */
	if (sc->sc_mbuf != NULL)
		pfsync_sendout(sc);

	/*
	 * Grab at most PFSYNC_BULKPACKETS worth of states which have not
	 * been sent since the latest request was made.
	 */
	state = sc->sc_bulk_send_next;
	if (state)
		do {
			/* send state update if syncable and not already sent */
			/* NOTE(review): pfsync_pack_state()'s result is ignored */
			if (!state->sync_flags
			    && state->timeout < PFTM_MAX
			    && state->pfsync_time <= sc->sc_ureq_received) {
				pfsync_pack_state(PFSYNC_ACT_UPD, state, 0);
				i++;
			}

			/* figure next state to send */
			state = TAILQ_NEXT(state, u.s.entry_list);

			/* wrap to start of list if we hit the end */
			if (!state)
				state = TAILQ_FIRST(&state_list);
		} while (i < sc->sc_maxcount * PFSYNC_BULKPACKETS &&
		    state != sc->sc_bulk_terminator);

	if (!state || state == sc->sc_bulk_terminator) {
		/* we're done */
		pfsync_send_bus(sc, PFSYNC_BUS_END);
		sc->sc_ureq_received = 0;
		sc->sc_bulk_send_next = NULL;
		sc->sc_bulk_terminator = NULL;
		timeout_del(&sc->sc_bulk_tmo);
		if (pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: bulk update complete\n");
	} else {
		/* look again for more in a bit */
#ifdef __FreeBSD__
		callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update,
		    pfsyncif);
#else
		timeout_add(&sc->sc_bulk_tmo, 1);
#endif
		/* remember where to resume on the next run */
		sc->sc_bulk_send_next = state;
	}
	/* Push out whatever this run packed but did not yet transmit. */
	if (sc->sc_mbuf != NULL)
		pfsync_sendout(sc);
	splx(s);
#ifdef __FreeBSD__
	PF_UNLOCK();
#endif
}

/*
 * Timer handler for sc_bulkfail_tmo: while retries remain, re-arm the
 * timer and ask for the bulk update again.  v is the pfsync softc.
 */
void
pfsync_bulkfail(void *v)
{
	struct pfsync_softc *sc = v;
	int s, error;

#ifdef __FreeBSD__
	PF_LOCK();
#endif
	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again in a bit */
#ifdef __FreeBSD__
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulkfail,
		    pfsyncif);
#else
		timeout_add(&sc->sc_bulkfail_tmo, 5 * hz);
#endif
		s = splnet();
		error = pfsync_request_update(NULL, NULL);
if (error == ENOMEM) {
			if (pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: cannot allocate mbufs for "
				    "bulk update\n");
		} else
			pfsync_sendout(sc);
		splx(s);
	} else {
		/* Pretend like the transfer was ok */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
#if NCARP > 0
		/*
		 * NOTE(review): on FreeBSD without CARP_ADVANCED no call is
		 * compiled under this `if', so it ends up guarding the
		 * assignment below.  The result is the same either way
		 * because the assignment only ever sets the flag to 1.
		 */
		if (!pfsync_sync_ok)
#ifdef __FreeBSD__
#ifdef CARP_ADVANCED
			carp_group_demote_adj(sc->sc_ifp, -1);
#endif
#else
			carp_group_demote_adj(&sc->sc_if, -1);
#endif
#endif
		pfsync_sync_ok = 1;
		if (pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive "
			    "bulk update status\n");
		timeout_del(&sc->sc_bulkfail_tmo);
	}
#ifdef __FreeBSD__
	PF_UNLOCK();
#endif
}

/*
 * Transmit the packet currently being assembled in sc->sc_mbuf.
 *
 * Cancels the flush timer, detaches the packet from the softc and shows it
 * to any bpf listener.  If a "compressed" companion packet was built in
 * sc->sc_mbuf_net, the full packet is freed and the compressed one is what
 * gets handed to pfsync_sendout_mbuf().
 *
 * Returns 0 when nothing is pending, otherwise the result of
 * pfsync_sendout_mbuf().
 */
/* This must be called in splnet() */
int
pfsync_sendout(struct pfsync_softc *sc)
{
#if NBPFILTER > 0
#ifdef __FreeBSD__
	struct ifnet *ifp = sc->sc_ifp;
#else
	struct ifnet *ifp = &sc->sc_if;
#endif
#endif
	struct mbuf *m;

#ifdef __FreeBSD__
	PF_ASSERT(MA_OWNED);
#endif
	timeout_del(&sc->sc_tmo);

	if (sc->sc_mbuf == NULL)
		return (0);
	/* Take ownership of the packet; the softc starts over empty. */
	m = sc->sc_mbuf;
	sc->sc_mbuf = NULL;
	sc->sc_statep.s = NULL;

#if NBPFILTER > 0
	if (ifp->if_bpf)
#ifdef __FreeBSD__
		BPF_MTAP(ifp, m);
#else
		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
#endif
#endif

	/* bpf saw the full packet; the wire gets the compressed one. */
	if (sc->sc_mbuf_net) {
		m_freem(m);
		m = sc->sc_mbuf_net;
		sc->sc_mbuf_net = NULL;
		sc->sc_statep_net.s = NULL;
	}

	return pfsync_sendout_mbuf(sc, m);
}

#ifdef PFSYNC_TDB
/*
 * Transmit the pending TDB update packet (sc->sc_mbuf_tdb).
 *
 * Same sequence as pfsync_sendout() but for the TDB packet and its own
 * timer; there is no compressed companion packet.
 *
 * Returns 0 when nothing is pending, otherwise the result of
 * pfsync_sendout_mbuf().
 */
int
pfsync_tdb_sendout(struct pfsync_softc *sc)
{
#if NBPFILTER > 0
#ifdef __FreeBSD__
	struct ifnet *ifp = sc->sc_ifp;
#else
	struct ifnet *ifp = &sc->sc_if;
#endif
#endif
	struct mbuf *m;

#ifdef __FreeBSD__
	PF_ASSERT(MA_OWNED);
#endif
	timeout_del(&sc->sc_tdb_tmo);

	if (sc->sc_mbuf_tdb == NULL)
		return (0);
	m = sc->sc_mbuf_tdb;
	sc->sc_mbuf_tdb = NULL;
	sc->sc_statep_tdb.t = NULL;

#if NBPFILTER > 0
	if (ifp->if_bpf)
#ifdef __FreeBSD__
		BPF_MTAP(ifp, m);
#else
		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
#endif
#endif

	return pfsync_sendout_mbuf(sc, m);
}
#endif

int
pfsync_sendout_mbuf(struct pfsync_softc *sc, struct mbuf *m)
{
	struct sockaddr sa;
	struct ip *ip;

#ifdef
__FreeBSD__ PF_ASSERT(MA_OWNED); #endif if (sc->sc_sync_ifp || #ifdef __FreeBSD__ sc->sc_sync_peer.s_addr != htonl(INADDR_PFSYNC_GROUP)) { #else sc->sc_sync_peer.s_addr != INADDR_PFSYNC_GROUP) { #endif M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) { pfsyncstats.pfsyncs_onomem++; return (0); } ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; #ifdef __FreeBSD__ ip->ip_len = m->m_pkthdr.len; #else ip->ip_len = htons(m->m_pkthdr.len); #endif ip->ip_id = htons(ip_randomid()); #ifdef __FreeBSD__ ip->ip_off = IP_DF; #else ip->ip_off = htons(IP_DF); #endif ip->ip_ttl = PFSYNC_DFLTTL; ip->ip_p = IPPROTO_PFSYNC; ip->ip_sum = 0; bzero(&sa, sizeof(sa)); ip->ip_src.s_addr = INADDR_ANY; #ifdef __FreeBSD__ if (sc->sc_sendaddr.s_addr == htonl(INADDR_PFSYNC_GROUP)) #else if (sc->sc_sendaddr.s_addr == INADDR_PFSYNC_GROUP) #endif m->m_flags |= M_MCAST; ip->ip_dst = sc->sc_sendaddr; sc->sc_sendaddr.s_addr = sc->sc_sync_peer.s_addr; pfsyncstats.pfsyncs_opackets++; #ifdef __FreeBSD__ if (!IF_HANDOFF(&sc->sc_ifq, m, NULL)) pfsyncstats.pfsyncs_oerrors++; taskqueue_enqueue(taskqueue_thread, &pfsyncif->sc_send_task); #else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) pfsyncstats.pfsyncs_oerrors++; #endif } else m_freem(m); return (0); } #ifdef PFSYNC_TDB /* Update an in-kernel tdb. Silently fail if no tdb is found. */ void pfsync_update_net_tdb(struct pfsync_tdb *pt) { struct tdb *tdb; int s; /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || (pt->dst.sa.sa_family != AF_INET && pt->dst.sa.sa_family != AF_INET6)) goto bad; s = spltdb(); tdb = gettdb(pt->spi, &pt->dst, pt->sproto); if (tdb) { pt->rpl = ntohl(pt->rpl); pt->cur_bytes = betoh64(pt->cur_bytes); /* Neither replay nor byte counter should ever decrease. 
*/ if (pt->rpl < tdb->tdb_rpl || pt->cur_bytes < tdb->tdb_cur_bytes) { splx(s); goto bad; } tdb->tdb_rpl = pt->rpl; tdb->tdb_cur_bytes = pt->cur_bytes; } splx(s); return; bad: if (pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: " "invalid value\n"); pfsyncstats.pfsyncs_badstate++; return; } /* One of our local tdbs have been updated, need to sync rpl with others */ int pfsync_update_tdb(struct tdb *tdb, int output) { struct ifnet *ifp = NULL; struct pfsync_softc *sc = pfsyncif; struct pfsync_header *h; struct pfsync_tdb *pt = NULL; int s, i, ret; if (sc == NULL) return (0); #ifdef __FreeBSD__ ifp = sc->sc_ifp; #else ifp = &sc->sc_if; #endif if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL && #ifdef __FreeBSD__ sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { #else sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { #endif /* Don't leave any stale pfsync packets hanging around. */ if (sc->sc_mbuf_tdb != NULL) { m_freem(sc->sc_mbuf_tdb); sc->sc_mbuf_tdb = NULL; sc->sc_statep_tdb.t = NULL; } return (0); } #ifdef __FreeBSD__ PF_ASSERT(MA_OWNED); #endif s = splnet(); if (sc->sc_mbuf_tdb == NULL) { if ((sc->sc_mbuf_tdb = pfsync_get_mbuf(sc, PFSYNC_ACT_TDB_UPD, (void *)&sc->sc_statep_tdb.t)) == NULL) { splx(s); return (ENOMEM); } h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *); } else { h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *); if (h->action != PFSYNC_ACT_TDB_UPD) { /* * XXX will never happen as long as there's * only one "TDB action". */ pfsync_tdb_sendout(sc); sc->sc_mbuf_tdb = pfsync_get_mbuf(sc, PFSYNC_ACT_TDB_UPD, (void *)&sc->sc_statep_tdb.t); if (sc->sc_mbuf_tdb == NULL) { splx(s); return (ENOMEM); } h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *); } else if (sc->sc_maxupdates) { /* * If it's an update, look in the packet to see if * we already have an update for the state. 
*/ struct pfsync_tdb *u = (void *)((char *)h + PFSYNC_HDRLEN); for (i = 0; !pt && i < h->count; i++) { if (tdb->tdb_spi == u->spi && tdb->tdb_sproto == u->sproto && !bcmp(&tdb->tdb_dst, &u->dst, SA_LEN(&u->dst.sa))) { pt = u; pt->updates++; } u++; } } } if (pt == NULL) { /* not a "duplicate" update */ pt = sc->sc_statep_tdb.t++; sc->sc_mbuf_tdb->m_pkthdr.len = sc->sc_mbuf_tdb->m_len += sizeof(struct pfsync_tdb); h->count++; bzero(pt, sizeof(*pt)); pt->spi = tdb->tdb_spi; memcpy(&pt->dst, &tdb->tdb_dst, sizeof pt->dst); pt->sproto = tdb->tdb_sproto; } /* * When a failover happens, the master's rpl is probably above * what we see here (we may be up to a second late), so * increase it a bit for outbound tdbs to manage most such * situations. * * For now, just add an offset that is likely to be larger * than the number of packets we can see in one second. The RFC * just says the next packet must have a higher seq value. * * XXX What is a good algorithm for this? We could use * a rate-determined increase, but to know it, we would have * to extend struct tdb. * XXX pt->rpl can wrap over MAXINT, but if so the real tdb * will soon be replaced anyway. For now, just don't handle * this edge case. */ #define RPL_INCR 16384 pt->rpl = htonl(tdb->tdb_rpl + (output ? RPL_INCR : 0)); pt->cur_bytes = htobe64(tdb->tdb_cur_bytes); if (h->count == sc->sc_maxcount || (sc->sc_maxupdates && (pt->updates >= sc->sc_maxupdates))) ret = pfsync_tdb_sendout(sc); splx(s); return (ret); } #endif /* PFSYNC_TDB */ #ifdef __FreeBSD__ void pfsync_ifdetach(void *arg, struct ifnet *ifp) { struct pfsync_softc *sc = (struct pfsync_softc *)arg; struct ip_moptions *imo; if (sc == NULL || sc->sc_sync_ifp != ifp) return; /* not for us; unlocked read */ PF_LOCK(); /* Deal with a member interface going away from under us. 
*/ sc->sc_sync_ifp = NULL; if (sc->sc_mbuf_net != NULL) { m_freem(sc->sc_mbuf_net); sc->sc_mbuf_net = NULL; sc->sc_statep_net.s = NULL; } imo = &sc->sc_imo; if (imo->imo_num_memberships > 0) { KASSERT(imo->imo_num_memberships == 1, ("%s: imo_num_memberships != 1", __func__)); /* * Our event handler is always called after protocol * domains have been detached from the underlying ifnet. * Do not call in_delmulti(); we held a single reference * which the protocol domain has purged in in_purgemaddrs(). */ PF_UNLOCK(); imo->imo_membership[--imo->imo_num_memberships] = NULL; PF_LOCK(); imo->imo_multicast_ifp = NULL; } PF_UNLOCK(); } void pfsync_senddef(void *arg, __unused int pending) { struct pfsync_softc *sc = (struct pfsync_softc *)arg; struct mbuf *m; for(;;) { IF_DEQUEUE(&sc->sc_ifq, m); if (m == NULL) break; /* Deal with a member interface going away from under us. */ if (sc->sc_sync_ifp == NULL) { pfsyncstats.pfsyncs_oerrors++; m_freem(m); continue; } if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) pfsyncstats.pfsyncs_oerrors++; } } static int pfsync_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: pfsyncattach(0); break; case MOD_UNLOAD: if_clone_detach(&pfsync_cloner); break; default: error = EINVAL; break; } return error; } static moduledata_t pfsync_mod = { "pfsync", pfsync_modevent, 0 }; #define PFSYNC_MODVER 1 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_VERSION(pfsync, PFSYNC_MODVER); MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER); #endif /* __FreeBSD__ */ Index: head/sys/net/bpf.c =================================================================== --- head/sys/net/bpf.c (revision 171636) +++ head/sys/net/bpf.c (revision 171637) @@ -1,1873 +1,1873 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)bpf.c 8.4 (Berkeley) 1/9/95 * * $FreeBSD$ */ #include "opt_bpf.h" #include "opt_mac.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define M_SKIP_BPF M_SKIP_FIREWALL /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ static LIST_HEAD(, bpf_if) bpf_iflist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpf_allocbufs(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_freed(struct bpf_d *); static void bpf_mcopy(const void *, void *, size_t); static int bpf_movein(struct uio *, int, int, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(const void *, void *, size_t), struct timeval *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static void bpf_clone(void *, struct ucred *, char *, int, struct cdev **); static int 
bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); static int bpf_bufsize = 4096; SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW, &bpf_bufsize, 0, "Default bpf buffer size"); static int bpf_maxbufsize = BPF_MAXBUFSIZE; SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW, &bpf_maxbufsize, 0, "Maximum bpf buffer size"); static int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); static d_open_t bpfopen; static d_close_t bpfclose; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_close = bpfclose, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; static int bpf_movein(struct uio *uio, int linktype, int mtu, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { const struct ieee80211_bpf_params *p; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? 
*/ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len - hlen > mtu) return (EMSGSIZE); if ((unsigned)len > MCLBYTES) return (EIO); if (len > MHLEN) { m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); } else { MGETHDR(m, M_TRYWAIT, MT_DATA); } if (m == NULL) return (ENOBUFS); m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; if (m->m_len < hlen) { error = EPERM; goto bad; } error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. 
* NB: packet may not be aligned after stripping
			 *     bpf params
			 * XXX check ibp_vers
			 */
			p = mtod(m, const struct ieee80211_bpf_params *);
			hlen = p->ibp_len;
			/* the header is copied into sa_data; it must fit */
			if (hlen > sizeof(sockp->sa_data)) {
				error = EINVAL;
				goto bad;
			}
		}
		bcopy(m->m_data, sockp->sa_data, hlen);
	}
	*hdrlen = hlen;

	return (0);
bad:
	m_freem(m);
	return (error);
}

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */
	BPFIF_LOCK(bp);
	d->bd_bif = bp;
	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);

	bpf_bpfd_cnt++;
	BPFIF_UNLOCK(bp);
}

/*
 * Detach a file from its interface.
 *
 * d must currently be attached (d->bd_bif is dereferenced without a
 * check).  If the descriptor had put the interface into promiscuous mode,
 * that request is withdrawn after the locks are dropped.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	int error;
	struct bpf_if *bp;
	struct ifnet *ifp;

	bp = d->bd_bif;
	BPFIF_LOCK(bp);
	BPFD_LOCK(d);
	/* save the ifnet now; d->bd_bif is cleared below */
	ifp = d->bd_bif->bif_ifp;

	/*
	 * Remove d from the interface's descriptor list.
	 */
	LIST_REMOVE(d, bd_next);

	bpf_bpfd_cnt--;
	d->bd_bif = NULL;
	BPFD_UNLOCK(d);
	BPFIF_UNLOCK(bp);

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		d->bd_promisc = 0;
		error = ifpromisc(ifp, 0);
		if (error != 0 && error != ENXIO) {
			/*
			 * ENXIO can happen if a pccard is unplugged
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			if_printf(bp->bif_ifp,
				"bpf_detach: ifpromisc failed (%d)\n", error);
		}
	}
}

/*
 * Open ethernet device.  Returns ENXIO for illegal minor device number,
 * EBUSY if file is open by another process.
 */
/* ARGSUSED */
static	int
bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct bpf_d *d;

	mtx_lock(&bpf_mtx);
	d = dev->si_drv1;
	/*
	 * Each minor can be opened by only one process.  If the requested
	 * minor is in use, return EBUSY.
*/ if (d != NULL) { mtx_unlock(&bpf_mtx); return (EBUSY); } dev->si_drv1 = (struct bpf_d *)~0; /* mark device in use */ mtx_unlock(&bpf_mtx); if ((dev->si_flags & SI_NAMED) == 0) make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600, "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); dev->si_drv1 = d; d->bd_bufsize = bpf_bufsize; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; #ifdef MAC mac_init_bpfdesc(d); mac_create_bpfdesc(td->td_ucred, d); #endif mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF); - callout_init(&d->bd_callout, NET_CALLOUT_MPSAFE); + callout_init(&d->bd_callout, CALLOUT_MPSAFE); knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL); return (0); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ /* ARGSUSED */ static int bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d = dev->si_drv1; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); mtx_lock(&bpf_mtx); if (d->bd_bif) bpf_detachd(d); mtx_unlock(&bpf_mtx); selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC mac_destroy_bpfdesc(d); #endif /* MAC */ knlist_destroy(&d->bd_sel.si_note); bpf_freed(d); dev->si_drv1 = NULL; free(d, M_BPF); return (0); } /* * Rotate the packet buffers in descriptor d. Move the store buffer * into the hold slot, and the free buffer into the store slot. * Zero the length of the new store buffer. 
*/ #define ROTATE_BUFFERS(d) \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d = dev->si_drv1; int timed_out; int error; /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if ((d->bd_immediate || timed_out) && d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. */ ROTATE_BUFFERS(d); break; } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (ioflag & O_NONBLOCK) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_mtx, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. 
*/
	BPFD_UNLOCK(d);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	/* NOTE(review): the copy-out runs with the descriptor unlocked. */
	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);

	/* The hold buffer has been consumed; recycle it as the free buffer. */
	BPFD_LOCK(d);
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	BPFD_UNLOCK(d);

	return (error);
}

/*
 * If there are processes sleeping on this descriptor, wake them up.
 *
 * The descriptor lock must be held (asserted below).  A pending read
 * timeout is cancelled, sleepers on d are woken, the configured signal is
 * posted when async mode is on, and select/poll and kqueue waiters are
 * notified.
 */
static __inline void
bpf_wakeup(struct bpf_d *d)
{

	BPFD_LOCK_ASSERT(d);
	if (d->bd_state == BPF_WAITING) {
		callout_stop(&d->bd_callout);
		d->bd_state = BPF_IDLE;
	}
	wakeup(d);
	if (d->bd_async && d->bd_sig && d->bd_sigio)
		pgsigio(&d->bd_sigio, d->bd_sig, 0);

	selwakeuppri(&d->bd_sel, PRINET);
	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
}

/*
 * Read-timeout handler; arg is the bpf descriptor.  Presumably run from
 * d->bd_callout -- the arming site is not visible here.
 *
 * A descriptor that is still waiting is marked as timed out, and its
 * readers are woken only if the store buffer already holds data.
 */
static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = (struct bpf_d *)arg;

	BPFD_LOCK(d);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			bpf_wakeup(d);
	}
	BPFD_UNLOCK(d);
}

static	int
bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d = dev->si_drv1;
	struct ifnet *ifp;
	struct mbuf *m, *mc;
	struct sockaddr dst;
	int error, hlen;

	if (d->bd_bif == NULL)
		return (ENXIO);

	ifp = d->bd_bif->bif_ifp;

	if ((ifp->if_flags & IFF_UP) == 0)
		return (ENETDOWN);

	if (uio->uio_resid == 0)
		return (0);

	bzero(&dst, sizeof(dst));
	m = NULL;
	hlen = 0;
	/* copy the user's packet into an mbuf and build the destination */
	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu,
	    &m, &dst, &hlen, d->bd_wfilter);
	if (error)
		return (error);

	if (d->bd_hdrcmplt)
		dst.sa_family = pseudo_AF_HDRCMPLT;

	if (d->bd_feedback) {
		/* duplicate the packet so it can be fed back as input */
		mc = m_dup(m, M_DONTWAIT);
		if (mc != NULL)
			mc->m_pkthdr.rcvif = ifp;
		/* XXX Do not return the same packet twice.
*/ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_SKIP_BPF; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ #ifdef MAC BPFD_LOCK(d); mac_create_mbuf_from_bpfdesc(d, m); if (mc != NULL) mac_create_mbuf_from_bpfdesc(d, mc); BPFD_UNLOCK(d); #endif NET_LOCK_GIANT(); error = (*ifp->if_output)(ifp, m, &dst, NULL); NET_UNLOCK_GIANT(); if (mc != NULL) { if (error == 0) { NET_LOCK_GIANT(); (*ifp->if_input)(ifp, mc); NET_UNLOCK_GIANT(); } else m_freem(mc); } return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the * receive and drop counts. */ static void reset_d(struct bpf_d *d) { mtx_assert(&d->bd_mtx, MA_OWNED); if (d->bd_hbuf) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; } d->bd_slen = 0; d->bd_hlen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; } /* * FIONREAD Check for read packet available. * SIOCGIFADDR Get interface address - convenient hook to driver. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set ethernet read filter. * BIOCSETWF Set ethernet write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d = dev->si_drv1; int error = 0; /* * Refresh PID associated with this descriptor. 
*/ BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: case BIOCGETIF: case BIOCGRTIMEOUT: case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: break; default: return (EPERM); } } switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } case SIOCGIFADDR: { struct ifnet *ifp; if (d->bd_bif == NULL) error = EINVAL; else { NET_LOCK_GIANT(); ifp = d->bd_bif->bif_ifp; error = (*ifp->if_ioctl)(ifp, cmd, addr); NET_UNLOCK_GIANT(); } break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: *(u_int *)addr = d->bd_bufsize; break; /* * Set buffer length. */ case BIOCSBLEN: if (d->bd_bif != NULL) error = EINVAL; else { u_int size = *(u_int *)addr; if (size > bpf_maxbufsize) *(u_int *)addr = size = bpf_maxbufsize; else if (size < BPF_MINBUFSIZE) *(u_int *)addr = size = BPF_MINBUFSIZE; d->bd_bufsize = size; } break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETWF: error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { NET_LOCK_GIANT(); error = ifpromisc(d->bd_bif->bif_ifp, 1); NET_UNLOCK_GIANT(); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. 
*/ case BIOCGDLT: if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; break; /* * Get a list of supported data link types. */ case BIOCGDLTLIST: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); break; /* * Set data link type. */ case BIOCSDLT: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); break; /* * Get interface name. */ case BIOCGETIF: if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } break; /* * Set interface. */ case BIOCSETIF: error = bpf_setif(d, (struct ifreq *)addr); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: { struct timeval *tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: { struct timeval *tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: d->bd_immediate = *(u_int *)addr; break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: *(u_int *)addr = d->bd_hdrcmplt; break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: d->bd_hdrcmplt = *(u_int *)addr ? 
1 : 0; break; /* * Get packet direction flag */ case BIOCGDIRECTION: *(u_int *)addr = d->bd_direction; break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: d->bd_direction = direction; break; default: error = EINVAL; } } break; case BIOCFEEDBACK: d->bd_feedback = *(u_int *)addr; break; case BIOCLOCK: d->bd_locked = 1; break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ d->bd_async = *(int *)addr; break; case FIOSETOWN: error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: *(int *)addr = fgetown(&d->bd_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else d->bd_sig = sig; break; } case BIOCGRSIG: *(u_int *)addr = d->bd_sig; break; } return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. 
*/ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { struct bpf_insn *fcode, *old; u_int wfilter, flen, size; #ifdef BPF_JITTER bpf_jit_filter *ofunc; #endif if (cmd == BIOCSETWF) { old = d->bd_wfilter; wfilter = 1; #ifdef BPF_JITTER ofunc = NULL; #endif } else { wfilter = 0; old = d->bd_rfilter; #ifdef BPF_JITTER ofunc = d->bd_bfilter; #endif } if (fp->bf_insns == NULL) { if (fp->bf_len != 0) return (EINVAL); BPFD_LOCK(d); if (wfilter) d->bd_wfilter = NULL; else { d->bd_rfilter = NULL; #ifdef BPF_JITTER d->bd_bfilter = NULL; #endif } reset_d(d); BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } flen = fp->bf_len; if (flen > bpf_maxinsns) return (EINVAL); size = flen * sizeof(*fp->bf_insns); fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK); if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && bpf_validate(fcode, (int)flen)) { BPFD_LOCK(d); if (wfilter) d->bd_wfilter = fcode; else { d->bd_rfilter = fcode; #ifdef BPF_JITTER d->bd_bfilter = bpf_jitter(fcode, flen); #endif } reset_d(d); BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } free((caddr_t)fcode, M_BPF); return (EINVAL); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * Allocate the packet buffers if we need to. * If we're already attached to requested interface, * just flush the buffer. */ if (d->bd_sbuf == NULL) bpf_allocbufs(d); if (bp != d->bd_bif) { if (d->bd_bif) /* * Detach if attached to something else. 
*/ bpf_detachd(d); bpf_attachd(d, bp); } BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; d = dev->si_drv1; if (d->bd_bif == NULL) return (ENXIO); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d = (struct bpf_d *)dev->si_drv1; if (kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. 
*/ BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; if (d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct bpf_d *d; u_int slen; int gottime; struct timeval tv; gottime = 0; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { BPFD_LOCK(d); ++d->bd_rcount; #ifdef BPF_JITTER if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL) slen = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bcopy, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } /* * Copy data from an mbuf chain into a buffer. This code is derived * from m_copydata in sys/uipc_mbuf.c. 
*/ static void bpf_mcopy(const void *src_arg, void *dst_arg, size_t len) { const struct mbuf *m; u_int count; u_char *dst; m = src_arg; dst = dst_arg; while (len > 0) { if (m == NULL) panic("bpf_mcopy"); count = min(m->m_len, len); bcopy(mtod(m, void *), dst, count); m = m->m_next; dst += count; len -= count; } } #define BPF_CHECK_DIRECTION(d, m) \ if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \ ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL)) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct bpf_d *d; u_int pktlen, slen; int gottime; struct timeval tv; if (m->m_flags & M_SKIP_BPF) { m->m_flags &= ~M_SKIP_BPF; return; } gottime = 0; pktlen = m_length(m, NULL); BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { BPF_CHECK_DIRECTION(d, m) continue; BPFD_LOCK(d); ++d->bd_rcount; #ifdef BPF_JITTER /* XXX We cannot handle multiple mbufs. */ if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL && m->m_next == NULL) slen = (*(d->bd_bfilter->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; struct timeval tv; if (m->m_flags & M_SKIP_BPF) { m->m_flags &= ~M_SKIP_BPF; return; } gottime = 0; pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. 
* Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { BPF_CHECK_DIRECTION(d, m) continue; BPFD_LOCK(d); ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_mcopy, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } #undef BPF_CHECK_DIRECTION /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_mcopy is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(const void *, void *, size_t), struct timeval *tv) { struct bpf_hdr *hp; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. */ curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize) { /* * This packet will overflow the storage buffer. * Rotate the buffers if we can, then wakeup any * pending reads. */ if (d->bd_fbuf == NULL) { /* * We haven't completed the previous read yet, * so drop the packet. 
*/ ++d->bd_dcount; return; } ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; /* * Append the bpf header. */ hp = (struct bpf_hdr *)(d->bd_sbuf + curlen); hp->bh_tstamp = *tv; hp->bh_datalen = pktlen; hp->bh_hdrlen = hdrlen; /* * Copy the packet data into the store buffer and update its length. */ (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Initialize all nonzero fields of a descriptor. */ static void bpf_allocbufs(struct bpf_d *d) { KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL")); KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL")); KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL")); d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); d->bd_slen = 0; d->bd_hlen = 0; } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ if (d->bd_sbuf != NULL) { free(d->bd_sbuf, M_BPF); if (d->bd_hbuf != NULL) free(d->bd_hbuf, M_BPF); if (d->bd_fbuf != NULL) free(d->bd_fbuf, M_BPF); } if (d->bd_rfilter) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER bpf_destroy_jit_filter(d->bd_bfilter); #endif } if (d->bd_wfilter) free((caddr_t)d->bd_wfilter, M_BPF); mtx_destroy(&d->bd_mtx); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). 
*/
void
bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
{

	/* Convenience wrapper: the bpf_if is stored in ifp->if_bpf. */
	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
}

/*
 * Attach an interface to bpf.  ifp is a pointer to the structure
 * defining the interface to be attached, dlt is the link layer type,
 * and hdrlen is the fixed size of the link header (variable length
 * headers are not yet supported).  The new bpf_if is returned through
 * driverp, which must be NULL on entry (asserted).  Panics if the
 * allocation fails.
 */
void
bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
{
	struct bpf_if *bp;

	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
	if (bp == NULL)
		panic("bpfattach");

	LIST_INIT(&bp->bif_dlist);
	bp->bif_ifp = ifp;
	bp->bif_dlt = dlt;
	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
	*driverp = bp;

	/* Publish the interface on the global list under bpf_mtx. */
	mtx_lock(&bpf_mtx);
	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
	mtx_unlock(&bpf_mtx);

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	if (bootverbose)
		if_printf(ifp, "bpf attached\n");
}

/*
 * Detach bpf from an interface. This involves detaching each descriptor
 * associated with the interface, and leaving bd_bif NULL. Notify each
 * descriptor as it's detached so that any sleepers wake up and get
 * ENXIO.
*/ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; /* Locate BPF interface information */ mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (ifp == bp->bif_ifp) break; } /* Interface wasn't attached */ if ((bp == NULL) || (bp->bif_ifp == NULL)) { mtx_unlock(&bpf_mtx); printf("bpfdetach: %s was not attached\n", ifp->if_xname); return; } LIST_REMOVE(bp, bif_next); mtx_unlock(&bpf_mtx); while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } mtx_destroy(&bp->bif_mtx); free(bp, M_BPF); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; ifp = d->bd_bif->bif_ifp; n = 0; error = 0; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { if (n >= bfl->bfl_len) { mtx_unlock(&bpf_mtx); return (ENOMEM); } error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); } n++; } mtx_unlock(&bpf_mtx); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } mtx_unlock(&bpf_mtx); if (bp != NULL) { opromisc = d->bd_promisc; bpf_detachd(d); bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "bpf_setdlt: ifpromisc failed (%d)\n", error); else d->bd_promisc = 1; } } return (bp == NULL ? 
EINVAL : 0); } static void bpf_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { int u; if (*dev != NULL) return; if (dev_stdclone(name, NULL, "bpf", &u) != 1) return; *dev = make_dev(&bpf_cdevsw, unit2minor(u), UID_ROOT, GID_WHEEL, 0600, "bpf%d", u); dev_ref(*dev); (*dev)->si_flags |= SI_CHEAPCLONE; return; } static void bpf_drvinit(void *unused) { mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); EVENTHANDLER_REGISTER(dev_clone, bpf_clone, 0, 1000); } static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = bd->bd_rcount; d->bd_dcount = bd->bd_dcount; d->bd_fcount = bd->bd_fcount; d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; } static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { struct xbpf_d *xbdbuf, *xbd; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. 
*/ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); mtx_lock(&bpf_mtx); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { mtx_unlock(&bpf_mtx); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_LOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } BPFIF_UNLOCK(bp); } mtx_unlock(&bpf_mtx); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL) #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ static struct bpf_if bp_null; void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = &bp_null; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ Index: head/sys/netgraph/netgraph.h =================================================================== --- head/sys/netgraph/netgraph.h (revision 171636) +++ head/sys/netgraph/netgraph.h (revision 171637) @@ -1,1139 +1,1139 @@ /* * netgraph.h */ /*- * Copyright (c) 1996-1999 Whistle 
Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. 
*
 * Author: Julian Elischer
 *
 * $FreeBSD$
 * $Whistle: netgraph.h,v 1.29 1999/11/01 07:56:13 julian Exp $
 */

#ifndef _NETGRAPH_NETGRAPH_H_
#define _NETGRAPH_NETGRAPH_H_

#ifndef _KERNEL
#error "This file should not be included in user level programs"
#endif

/*
 * NOTE(review): the header names on the next five #include lines are
 * missing in this copy of the file; restore them from the original.
 */
#include
#include
#include
#include
#include

#include "opt_netgraph.h"

/* debugging options */
#define NG_SEPARATE_MALLOC	/* make modules use their own malloc types */

/*
 * This defines the in-kernel binary interface version.
 * It is possible to change this but leave the external message
 * API the same. Each type also has its own cookies for versioning as well.
 * Change it for NETGRAPH_DEBUG version so we cannot mix debug and non debug
 * modules.
 */
#define _NG_ABI_VERSION 11
#ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
#define NG_ABI_VERSION	(_NG_ABI_VERSION + 0x10000)
#else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NG_ABI_VERSION	_NG_ABI_VERSION
#endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/


/*
 * Forward references for the basic structures so we can
 * define the typedefs and use them in the structures themselves.
*/
struct ng_hook ;
struct ng_node ;
struct ng_item ;
typedef	struct ng_item *item_p;
typedef struct ng_node *node_p;
typedef struct ng_hook *hook_p;

/* node method definitions */
typedef	int	ng_constructor_t(node_p node);
typedef	int	ng_close_t(node_p node);
typedef	int	ng_shutdown_t(node_p node);
typedef	int	ng_newhook_t(node_p node, hook_p hook, const char *name);
typedef	hook_p	ng_findhook_t(node_p node, const char *name);
typedef	int	ng_connect_t(hook_p hook);
typedef	int	ng_rcvmsg_t(node_p node, item_p item, hook_p lasthook);
typedef	int	ng_rcvdata_t(hook_p hook, item_p item);
typedef	int	ng_disconnect_t(hook_p hook);
typedef	int	ng_rcvitem (node_p node, hook_p hook, item_p item);

/***********************************************************************
 ***************** Hook Structure and Methods **************************
 ***********************************************************************
 *
 * Structure of a hook
 */
struct ng_hook {
	char	hk_name[NG_HOOKSIZ];	/* what this node knows this link as */
	void   *hk_private;		/* node dependent ID for this hook */
	int	hk_flags;		/* info about this hook/link */
	int	hk_refs;		/* don't actually free this till 0 */
	int	hk_type;		/* tbd: hook data link type */
	struct	ng_hook *hk_peer;	/* the other end of this link */
	struct	ng_node *hk_node;	/* The node this hook is attached to */
	LIST_ENTRY(ng_hook) hk_hooks;	/* linked list of all hooks on node */
	ng_rcvmsg_t	*hk_rcvmsg;	/* control messages come here */
	ng_rcvdata_t	*hk_rcvdata;	/* data comes here */
#ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
#define HK_MAGIC 0x78573011
	int	hk_magic;		/* compared with HK_MAGIC by _chkhook() */
	char	*lastfile;		/* file of the last checked access */
	int	lastline;		/* line of the last checked access */
	SLIST_ENTRY(ng_hook)	  hk_all;		/* all existing items */
#endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
};
/* Flags for a hook */
#define HK_INVALID		0x0001	/* don't trust it!
*/
#define HK_QUEUE		0x0002	/* queue for later delivery */
#define HK_FORCE_WRITER		0x0004	/* Incoming data queued as a writer */
#define HK_DEAD			0x0008	/* This is the dead hook.. don't free */

/*
 * Public Methods for hook
 * If you can't do it with these you probably shouldn't be doing it.
 *
 * The underscore-prefixed macros below are the raw accessors; the public
 * NG_HOOK_* names map onto them directly, or through checking wrappers
 * when NETGRAPH_DEBUG is defined.
 */
void ng_unref_hook(hook_p hook); /* don't move this */
#define	_NG_HOOK_REF(hook)	atomic_add_int(&(hook)->hk_refs, 1)
#define _NG_HOOK_NAME(hook)	((hook)->hk_name)
#define _NG_HOOK_UNREF(hook)	ng_unref_hook(hook)
#define	_NG_HOOK_SET_PRIVATE(hook, val)	do {(hook)->hk_private = val;} while (0)
#define	_NG_HOOK_SET_RCVMSG(hook, val)	do {(hook)->hk_rcvmsg = val;} while (0)
#define	_NG_HOOK_SET_RCVDATA(hook, val)	do {(hook)->hk_rcvdata = val;} while (0)
#define	_NG_HOOK_PRIVATE(hook)	((hook)->hk_private)
#define _NG_HOOK_NOT_VALID(hook)	((hook)->hk_flags & HK_INVALID)
#define _NG_HOOK_IS_VALID(hook)	(!((hook)->hk_flags & HK_INVALID))
#define _NG_HOOK_NODE(hook)	((hook)->hk_node) /* only rvalue! */
#define _NG_HOOK_PEER(hook)	((hook)->hk_peer) /* only rvalue!
*/ #define _NG_HOOK_FORCE_WRITER(hook) \ do { hook->hk_flags |= HK_FORCE_WRITER; } while (0) #define _NG_HOOK_FORCE_QUEUE(hook) do { hook->hk_flags |= HK_QUEUE; } while (0) /* Some shortcuts */ #define NG_PEER_NODE(hook) NG_HOOK_NODE(NG_HOOK_PEER(hook)) #define NG_PEER_HOOK_NAME(hook) NG_HOOK_NAME(NG_HOOK_PEER(hook)) #define NG_PEER_NODE_NAME(hook) NG_NODE_NAME(NG_PEER_NODE(hook)) #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ #define _NN_ __FILE__,__LINE__ void dumphook (hook_p hook, char *file, int line); static __inline void _chkhook(hook_p hook, char *file, int line); static __inline void _ng_hook_ref(hook_p hook, char * file, int line); static __inline char * _ng_hook_name(hook_p hook, char * file, int line); static __inline void _ng_hook_unref(hook_p hook, char * file, int line); static __inline void _ng_hook_set_private(hook_p hook, void * val, char * file, int line); static __inline void _ng_hook_set_rcvmsg(hook_p hook, ng_rcvmsg_t *val, char * file, int line); static __inline void _ng_hook_set_rcvdata(hook_p hook, ng_rcvdata_t *val, char * file, int line); static __inline void * _ng_hook_private(hook_p hook, char * file, int line); static __inline int _ng_hook_not_valid(hook_p hook, char * file, int line); static __inline int _ng_hook_is_valid(hook_p hook, char * file, int line); static __inline node_p _ng_hook_node(hook_p hook, char * file, int line); static __inline hook_p _ng_hook_peer(hook_p hook, char * file, int line); static __inline void _ng_hook_force_writer(hook_p hook, char * file, int line); static __inline void _ng_hook_force_queue(hook_p hook, char * file, int line); static __inline void _chkhook(hook_p hook, char *file, int line) { if (hook->hk_magic != HK_MAGIC) { printf("Accessing freed hook "); dumphook(hook, file, line); } hook->lastline = line; hook->lastfile = file; } static __inline void _ng_hook_ref(hook_p hook, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_REF(hook); } static __inline char * 
_ng_hook_name(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_NAME(hook)); } static __inline void _ng_hook_unref(hook_p hook, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_UNREF(hook); } static __inline void _ng_hook_set_private(hook_p hook, void *val, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_SET_PRIVATE(hook, val); } static __inline void _ng_hook_set_rcvmsg(hook_p hook, ng_rcvmsg_t *val, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_SET_RCVMSG(hook, val); } static __inline void _ng_hook_set_rcvdata(hook_p hook, ng_rcvdata_t *val, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_SET_RCVDATA(hook, val); } static __inline void * _ng_hook_private(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_PRIVATE(hook)); } static __inline int _ng_hook_not_valid(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_NOT_VALID(hook)); } static __inline int _ng_hook_is_valid(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_IS_VALID(hook)); } static __inline node_p _ng_hook_node(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_NODE(hook)); } static __inline hook_p _ng_hook_peer(hook_p hook, char * file, int line) { _chkhook(hook, file, line); return (_NG_HOOK_PEER(hook)); } static __inline void _ng_hook_force_writer(hook_p hook, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_FORCE_WRITER(hook); } static __inline void _ng_hook_force_queue(hook_p hook, char * file, int line) { _chkhook(hook, file, line); _NG_HOOK_FORCE_QUEUE(hook); } #define NG_HOOK_REF(hook) _ng_hook_ref(hook, _NN_) #define NG_HOOK_NAME(hook) _ng_hook_name(hook, _NN_) #define NG_HOOK_UNREF(hook) _ng_hook_unref(hook, _NN_) #define NG_HOOK_SET_PRIVATE(hook, val) _ng_hook_set_private(hook, val, _NN_) #define NG_HOOK_SET_RCVMSG(hook, val) _ng_hook_set_rcvmsg(hook, val, _NN_) 
#define NG_HOOK_SET_RCVDATA(hook, val)	_ng_hook_set_rcvdata(hook, val, _NN_)
#define NG_HOOK_PRIVATE(hook)		_ng_hook_private(hook, _NN_)
#define NG_HOOK_NOT_VALID(hook)		_ng_hook_not_valid(hook, _NN_)
#define NG_HOOK_IS_VALID(hook)		_ng_hook_is_valid(hook, _NN_)
#define NG_HOOK_NODE(hook)		_ng_hook_node(hook, _NN_)
#define NG_HOOK_PEER(hook)		_ng_hook_peer(hook, _NN_)
#define NG_HOOK_FORCE_WRITER(hook)	_ng_hook_force_writer(hook, _NN_)
#define NG_HOOK_FORCE_QUEUE(hook)	_ng_hook_force_queue(hook, _NN_)

#else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/

/* Non-debug build: the public names expand straight to the raw macros. */
#define NG_HOOK_REF(hook)		_NG_HOOK_REF(hook)
#define NG_HOOK_NAME(hook)		_NG_HOOK_NAME(hook)
#define NG_HOOK_UNREF(hook)		_NG_HOOK_UNREF(hook)
#define NG_HOOK_SET_PRIVATE(hook, val)	_NG_HOOK_SET_PRIVATE(hook, val)
#define NG_HOOK_SET_RCVMSG(hook, val)	_NG_HOOK_SET_RCVMSG(hook, val)
#define NG_HOOK_SET_RCVDATA(hook, val)	_NG_HOOK_SET_RCVDATA(hook, val)
#define NG_HOOK_PRIVATE(hook)		_NG_HOOK_PRIVATE(hook)
#define NG_HOOK_NOT_VALID(hook)		_NG_HOOK_NOT_VALID(hook)
#define NG_HOOK_IS_VALID(hook)		_NG_HOOK_IS_VALID(hook)
#define NG_HOOK_NODE(hook)		_NG_HOOK_NODE(hook)
#define NG_HOOK_PEER(hook)		_NG_HOOK_PEER(hook)
#define NG_HOOK_FORCE_WRITER(hook)	_NG_HOOK_FORCE_WRITER(hook)
#define NG_HOOK_FORCE_QUEUE(hook)	_NG_HOOK_FORCE_QUEUE(hook)

#endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/

/***********************************************************************
 ***************** Node Structure and Methods **************************
 ***********************************************************************
 * Structure of a node
 * including the embedded queue structure.
 *
 * The structure for queueing Netgraph request items
 * embedded in the node structure
 */
struct ng_queue {
	u_long		q_flags;
	struct mtx	q_mtx;
	item_p		queue;
	item_p		*last;
	struct ng_node	*q_node;		/* find the front of the node.. */
};

/*
 * A netgraph node.  The input queue (struct ng_queue) is embedded by
 * value as nd_input_queue; the NETGRAPH_DEBUG members exist only in
 * debug builds, so the structure size differs between the two builds.
 */
struct ng_node {
	char	nd_name[NG_NODESIZ];	/* optional globally unique name */
	struct	ng_type *nd_type;	/* the installed 'type' */
	int	nd_flags;		/* see below for bit definitions */
	int	nd_refs;		/* # of references to this node */
	int	nd_numhooks;		/* number of hooks */
	void   *nd_private;		/* node type dependent node ID */
	ng_ID_t	nd_ID;			/* Unique per node */
	LIST_HEAD(hooks, ng_hook) nd_hooks;	/* linked list of node hooks */
	LIST_ENTRY(ng_node)	  nd_nodes;	/* linked list of all nodes */
	LIST_ENTRY(ng_node)	  nd_idnodes;	/* ID hash collision list */
	TAILQ_ENTRY(ng_node)	  nd_work;	/* nodes with work to do */
	struct	ng_queue	  nd_input_queue; /* input queue for locking */
#ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
#define ND_MAGIC 0x59264837
	int	nd_magic;		/* compared against ND_MAGIC by _chknode() */
	char	*lastfile;		/* file of the last checked access */
	int	lastline;		/* line of the last checked access */
	SLIST_ENTRY(ng_node)	  nd_all;	/* all existing nodes */
#endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
};

/* Flags for a node */
#define NGF_INVALID	0x00000001	/* free when refs go to 0 */
#define NG_INVALID	NGF_INVALID	/* compat for old code */
#define NGF_WORKQ	0x00000002	/* node is on the work queue */
#define NG_WORKQ	NGF_WORKQ	/* compat for old code */
#define NGF_FORCE_WRITER	0x00000004	/* Never multithread this node */
#define NG_FORCE_WRITER	NGF_FORCE_WRITER /* compat for old code */
#define NGF_CLOSING	0x00000008	/* ng_rmnode() at work */
#define NG_CLOSING	NGF_CLOSING	/* compat for old code */
#define NGF_REALLY_DIE	0x00000010	/* "persistent" node is unloading */
#define NG_REALLY_DIE	NGF_REALLY_DIE	/* compat for old code */
#define NGF_TYPE1	0x10000000	/* reserved for type specific storage */
#define NGF_TYPE2	0x20000000	/* reserved for type specific storage */
#define NGF_TYPE3	0x40000000	/* reserved for type specific storage */
#define NGF_TYPE4	0x80000000	/* reserved for type specific storage */

/*
 * Public methods for nodes.
 * If you can't do it with these you probably shouldn't be doing it.
*/ int ng_unref_node(node_p node); /* don't move this */ #define _NG_NODE_NAME(node) ((node)->nd_name + 0) #define _NG_NODE_HAS_NAME(node) ((node)->nd_name[0] + 0) #define _NG_NODE_ID(node) ((node)->nd_ID + 0) #define _NG_NODE_REF(node) atomic_add_int(&(node)->nd_refs, 1) #define _NG_NODE_UNREF(node) ng_unref_node(node) #define _NG_NODE_SET_PRIVATE(node, val) do {(node)->nd_private = val;} while (0) #define _NG_NODE_PRIVATE(node) ((node)->nd_private) #define _NG_NODE_IS_VALID(node) (!((node)->nd_flags & NGF_INVALID)) #define _NG_NODE_NOT_VALID(node) ((node)->nd_flags & NGF_INVALID) #define _NG_NODE_NUMHOOKS(node) ((node)->nd_numhooks + 0) /* rvalue */ #define _NG_NODE_FORCE_WRITER(node) \ do{ node->nd_flags |= NGF_FORCE_WRITER; }while (0) #define _NG_NODE_REALLY_DIE(node) \ do{ node->nd_flags |= (NGF_REALLY_DIE|NGF_INVALID); }while (0) #define _NG_NODE_REVIVE(node) \ do { node->nd_flags &= ~NGF_INVALID; } while (0) /* * The hook iterator. * This macro will call a function of type ng_fn_eachhook for each * hook attached to the node. If the function returns 0, then the * iterator will stop and return a pointer to the hook that returned 0. 
*/ typedef int ng_fn_eachhook(hook_p hook, void* arg); #define _NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \ do { \ hook_p _hook; \ (rethook) = NULL; \ LIST_FOREACH(_hook, &((node)->nd_hooks), hk_hooks) { \ if ((fn)(_hook, arg) == 0) { \ (rethook) = _hook; \ break; \ } \ } \ } while (0) #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ void dumpnode(node_p node, char *file, int line); static __inline void _chknode(node_p node, char *file, int line); static __inline char * _ng_node_name(node_p node, char *file, int line); static __inline int _ng_node_has_name(node_p node, char *file, int line); static __inline ng_ID_t _ng_node_id(node_p node, char *file, int line); static __inline void _ng_node_ref(node_p node, char *file, int line); static __inline int _ng_node_unref(node_p node, char *file, int line); static __inline void _ng_node_set_private(node_p node, void * val, char *file, int line); static __inline void * _ng_node_private(node_p node, char *file, int line); static __inline int _ng_node_is_valid(node_p node, char *file, int line); static __inline int _ng_node_not_valid(node_p node, char *file, int line); static __inline int _ng_node_numhooks(node_p node, char *file, int line); static __inline void _ng_node_force_writer(node_p node, char *file, int line); static __inline hook_p _ng_node_foreach_hook(node_p node, ng_fn_eachhook *fn, void *arg, char *file, int line); static __inline void _ng_node_revive(node_p node, char *file, int line); static __inline void _chknode(node_p node, char *file, int line) { if (node->nd_magic != ND_MAGIC) { printf("Accessing freed node "); dumpnode(node, file, line); } node->lastline = line; node->lastfile = file; } static __inline char * _ng_node_name(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_NAME(node)); } static __inline int _ng_node_has_name(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_HAS_NAME(node)); } static __inline ng_ID_t 
_ng_node_id(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_ID(node)); } static __inline void _ng_node_ref(node_p node, char *file, int line) { _chknode(node, file, line); _NG_NODE_REF(node); } static __inline int _ng_node_unref(node_p node, char *file, int line) { _chknode(node, file, line); return (_NG_NODE_UNREF(node)); } static __inline void _ng_node_set_private(node_p node, void * val, char *file, int line) { _chknode(node, file, line); _NG_NODE_SET_PRIVATE(node, val); } static __inline void * _ng_node_private(node_p node, char *file, int line) { _chknode(node, file, line); return (_NG_NODE_PRIVATE(node)); } static __inline int _ng_node_is_valid(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_IS_VALID(node)); } static __inline int _ng_node_not_valid(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_NOT_VALID(node)); } static __inline int _ng_node_numhooks(node_p node, char *file, int line) { _chknode(node, file, line); return(_NG_NODE_NUMHOOKS(node)); } static __inline void _ng_node_force_writer(node_p node, char *file, int line) { _chknode(node, file, line); _NG_NODE_FORCE_WRITER(node); } static __inline void _ng_node_really_die(node_p node, char *file, int line) { _chknode(node, file, line); _NG_NODE_REALLY_DIE(node); } static __inline void _ng_node_revive(node_p node, char *file, int line) { _chknode(node, file, line); _NG_NODE_REVIVE(node); } static __inline hook_p _ng_node_foreach_hook(node_p node, ng_fn_eachhook *fn, void *arg, char *file, int line) { hook_p hook; _chknode(node, file, line); _NG_NODE_FOREACH_HOOK(node, fn, arg, hook); return (hook); } #define NG_NODE_NAME(node) _ng_node_name(node, _NN_) #define NG_NODE_HAS_NAME(node) _ng_node_has_name(node, _NN_) #define NG_NODE_ID(node) _ng_node_id(node, _NN_) #define NG_NODE_REF(node) _ng_node_ref(node, _NN_) #define NG_NODE_UNREF(node) _ng_node_unref(node, _NN_) #define NG_NODE_SET_PRIVATE(node, val) 
_ng_node_set_private(node, val, _NN_) #define NG_NODE_PRIVATE(node) _ng_node_private(node, _NN_) #define NG_NODE_IS_VALID(node) _ng_node_is_valid(node, _NN_) #define NG_NODE_NOT_VALID(node) _ng_node_not_valid(node, _NN_) #define NG_NODE_FORCE_WRITER(node) _ng_node_force_writer(node, _NN_) #define NG_NODE_REALLY_DIE(node) _ng_node_really_die(node, _NN_) #define NG_NODE_NUMHOOKS(node) _ng_node_numhooks(node, _NN_) #define NG_NODE_REVIVE(node) _ng_node_revive(node, _NN_) #define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \ do { \ rethook = _ng_node_foreach_hook(node, fn, (void *)arg, _NN_); \ } while (0) #else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ #define NG_NODE_NAME(node) _NG_NODE_NAME(node) #define NG_NODE_HAS_NAME(node) _NG_NODE_HAS_NAME(node) #define NG_NODE_ID(node) _NG_NODE_ID(node) #define NG_NODE_REF(node) _NG_NODE_REF(node) #define NG_NODE_UNREF(node) _NG_NODE_UNREF(node) #define NG_NODE_SET_PRIVATE(node, val) _NG_NODE_SET_PRIVATE(node, val) #define NG_NODE_PRIVATE(node) _NG_NODE_PRIVATE(node) #define NG_NODE_IS_VALID(node) _NG_NODE_IS_VALID(node) #define NG_NODE_NOT_VALID(node) _NG_NODE_NOT_VALID(node) #define NG_NODE_FORCE_WRITER(node) _NG_NODE_FORCE_WRITER(node) #define NG_NODE_REALLY_DIE(node) _NG_NODE_REALLY_DIE(node) #define NG_NODE_NUMHOOKS(node) _NG_NODE_NUMHOOKS(node) #define NG_NODE_REVIVE(node) _NG_NODE_REVIVE(node) #define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \ _NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) #endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ /*********************************************************************** ************* Node Queue and Item Structures and Methods ************** *********************************************************************** * */ typedef void ng_item_fn(node_p node, hook_p hook, void *arg1, int arg2); typedef void ng_apply_t(void *context, int error); struct ng_item { u_long el_flags; item_p el_next; node_p el_dest; /* The node it 
will be applied against (or NULL) */ hook_p el_hook; /* Entering hook. Optional in Control messages */ union { struct mbuf *da_m; struct { struct ng_mesg *msg_msg; ng_ID_t msg_retaddr; } msg; struct { ng_item_fn *fn_fn; void *fn_arg1; int fn_arg2; } fn; } body; /* * Optional callback called when item is being applied, * and its context. */ ng_apply_t *apply; void *context; #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ char *lastfile; int lastline; TAILQ_ENTRY(ng_item) all; /* all existing items */ #endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ }; #define NGQF_TYPE 0x03 /* MASK of content definition */ #define NGQF_MESG 0x00 /* the queue element is a message */ #define NGQF_DATA 0x01 /* the queue element is data */ #define NGQF_FN 0x02 /* the queue element is a function */ #define NGQF_UNDEF 0x03 /* UNDEFINED */ #define NGQF_RW 0x04 /* MASK for wanted queue mode */ #define NGQF_READER 0x04 /* wants to be a reader */ #define NGQF_WRITER 0x00 /* wants to be a writer */ #define NGQF_QMODE 0x08 /* MASK for how it was queued */ #define NGQF_QREADER 0x08 /* was queued as a reader */ #define NGQF_QWRITER 0x00 /* was queued as a writer */ /* * Get the mbuf (etc) out of an item. * Sets the value in the item to NULL in case we need to call NG_FREE_ITEM() * with it, (to avoid freeing the things twice). * If you don't want to zero out the item then realise that the * item still owns it. * Retaddr is different. There are no references on that. It's just a number. * The debug versions must be either all used everywhere or not at all. 
*/ #define _NGI_M(i) ((i)->body.da_m) #define _NGI_MSG(i) ((i)->body.msg.msg_msg) #define _NGI_RETADDR(i) ((i)->body.msg.msg_retaddr) #define _NGI_FN(i) ((i)->body.fn.fn_fn) #define _NGI_ARG1(i) ((i)->body.fn.fn_arg1) #define _NGI_ARG2(i) ((i)->body.fn.fn_arg2) #define _NGI_NODE(i) ((i)->el_dest) #define _NGI_HOOK(i) ((i)->el_hook) #define _NGI_SET_HOOK(i,h) do { _NGI_HOOK(i) = h; h = NULL;} while (0) #define _NGI_CLR_HOOK(i) do { \ hook_p _hook = _NGI_HOOK(i); \ if (_hook) { \ _NG_HOOK_UNREF(_hook); \ _NGI_HOOK(i) = NULL; \ } \ } while (0) #define _NGI_SET_NODE(i,n) do { _NGI_NODE(i) = n; n = NULL;} while (0) #define _NGI_CLR_NODE(i) do { \ node_p _node = _NGI_NODE(i); \ if (_node) { \ _NG_NODE_UNREF(_node); \ _NGI_NODE(i) = NULL; \ } \ } while (0) #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ void dumpitem(item_p item, char *file, int line); static __inline void _ngi_check(item_p item, char *file, int line) ; static __inline struct mbuf ** _ngi_m(item_p item, char *file, int line) ; static __inline ng_ID_t * _ngi_retaddr(item_p item, char *file, int line); static __inline struct ng_mesg ** _ngi_msg(item_p item, char *file, int line) ; static __inline ng_item_fn ** _ngi_fn(item_p item, char *file, int line) ; static __inline void ** _ngi_arg1(item_p item, char *file, int line) ; static __inline int * _ngi_arg2(item_p item, char *file, int line) ; static __inline node_p _ngi_node(item_p item, char *file, int line); static __inline hook_p _ngi_hook(item_p item, char *file, int line); static __inline void _ngi_check(item_p item, char *file, int line) { (item)->lastline = line; (item)->lastfile = file; } static __inline struct mbuf ** _ngi_m(item_p item, char *file, int line) { _ngi_check(item, file, line); return (&_NGI_M(item)); } static __inline struct ng_mesg ** _ngi_msg(item_p item, char *file, int line) { _ngi_check(item, file, line); return (&_NGI_MSG(item)); } static __inline ng_ID_t * _ngi_retaddr(item_p item, char *file, int line) 
{
	/* Record the caller, then hand back the address of the field. */
	_ngi_check(item, file, line);
	return (&_NGI_RETADDR(item));
}

static __inline ng_item_fn **
_ngi_fn(item_p item, char *file, int line)
{
	_ngi_check(item, file, line);
	return (&_NGI_FN(item));
}

static __inline void **
_ngi_arg1(item_p item, char *file, int line)
{
	_ngi_check(item, file, line);
	return (&_NGI_ARG1(item));
}

static __inline int *
_ngi_arg2(item_p item, char *file, int line)
{
	_ngi_check(item, file, line);
	return (&_NGI_ARG2(item));
}

/* Unlike the wrappers above, node and hook are returned by value. */
static __inline node_p
_ngi_node(item_p item, char *file, int line)
{
	_ngi_check(item, file, line);
	return (_NGI_NODE(item));
}

static __inline hook_p
_ngi_hook(item_p item, char *file, int line)
{
	_ngi_check(item, file, line);
	return (_NGI_HOOK(item));
}

/*
 * Debug build: the first six accessors dereference the pointer returned
 * by the wrapper and so stay assignable; NGI_HOOK()/NGI_NODE() are plain
 * function calls and therefore rvalues.
 */
#define NGI_M(i)	(*_ngi_m(i, _NN_))
#define NGI_MSG(i)	(*_ngi_msg(i, _NN_))
#define NGI_RETADDR(i)	(*_ngi_retaddr(i, _NN_))
#define NGI_FN(i)	(*_ngi_fn(i, _NN_))
#define NGI_ARG1(i)	(*_ngi_arg1(i, _NN_))
#define NGI_ARG2(i)	(*_ngi_arg2(i, _NN_))
#define NGI_HOOK(i)	_ngi_hook(i, _NN_)
#define NGI_NODE(i)	_ngi_node(i, _NN_)
#define	NGI_SET_HOOK(i,h)						\
	do { _ngi_check(i, _NN_); _NGI_SET_HOOK(i, h); } while (0)
#define	NGI_CLR_HOOK(i)							\
	do { _ngi_check(i, _NN_); _NGI_CLR_HOOK(i); } while (0)
#define	NGI_SET_NODE(i,n)						\
	do { _ngi_check(i, _NN_); _NGI_SET_NODE(i, n); } while (0)
#define	NGI_CLR_NODE(i)							\
	do { _ngi_check(i, _NN_); _NGI_CLR_NODE(i); } while (0)

#define NG_FREE_ITEM(item)						\
	do {								\
		_ngi_check(item, _NN_);					\
		ng_free_item((item));					\
	} while (0)

/* Stamp the item with the location of the macro use. */
#define	SAVE_LINE(item)							\
	do {								\
		(item)->lastline = __LINE__;				\
		(item)->lastfile = __FILE__;				\
	} while (0)

#else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/

/* Non-debug build: the public names expand straight to the raw macros. */
#define NGI_M(i)	_NGI_M(i)
#define NGI_MSG(i)	_NGI_MSG(i)
#define NGI_RETADDR(i)	_NGI_RETADDR(i)
#define NGI_FN(i)	_NGI_FN(i)
#define NGI_ARG1(i)	_NGI_ARG1(i)
#define NGI_ARG2(i)	_NGI_ARG2(i)
#define	NGI_NODE(i)	_NGI_NODE(i)
#define	NGI_HOOK(i)	_NGI_HOOK(i)
#define	NGI_SET_HOOK(i,h) _NGI_SET_HOOK(i,h)
#define	NGI_CLR_HOOK(i)	  _NGI_CLR_HOOK(i)
#define NGI_SET_NODE(i,n) _NGI_SET_NODE(i,n) #define NGI_CLR_NODE(i) _NGI_CLR_NODE(i) #define NG_FREE_ITEM(item) ng_free_item((item)) #define SAVE_LINE(item) do {} while (0) #endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ #define NGI_GET_M(i,m) \ do { \ (m) = NGI_M(i); \ _NGI_M(i) = NULL; \ } while (0) #define NGI_GET_MSG(i,m) \ do { \ (m) = NGI_MSG(i); \ _NGI_MSG(i) = NULL; \ } while (0) #define NGI_GET_NODE(i,n) /* YOU NOW HAVE THE REFERENCE */ \ do { \ (n) = NGI_NODE(i); \ _NGI_NODE(i) = NULL; \ } while (0) #define NGI_GET_HOOK(i,h) \ do { \ (h) = NGI_HOOK(i); \ _NGI_HOOK(i) = NULL; \ } while (0) #define NGI_SET_WRITER(i) ((i)->el_flags &= ~NGQF_QMODE) #define NGI_SET_READER(i) ((i)->el_flags |= NGQF_QREADER) #define NGI_QUEUED_READER(i) ((i)->el_flags & NGQF_QREADER) #define NGI_QUEUED_WRITER(i) (((i)->el_flags & NGQF_QMODE) == NGQF_QWRITER) /********************************************************************** * Data macros. Send, manipulate and free. **********************************************************************/ /* * Assuming the data is already ok, just set the new address and send */ #define NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags) \ do { \ (error) = \ ng_address_hook(NULL, (item), (hook), NG_NOFLAGS); \ if (error == 0) { \ SAVE_LINE(item); \ (error) = ng_snd_item((item), (flags)); \ } \ (item) = NULL; \ } while (0) #define NG_FWD_ITEM_HOOK(error, item, hook) \ NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, NG_NOFLAGS) /* * Forward a data packet. Mbuf pointer is updated to new value. We * presume you dealt with the old one when you update it to the new one * (or it maybe the old one). We got a packet and possibly had to modify * the mbuf. You should probably use NGI_GET_M() if you are going to use * this too. 
*/ #define NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, flags) \ do { \ NGI_M(item) = (m); \ (m) = NULL; \ NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags); \ } while (0) #define NG_FWD_NEW_DATA(error, item, hook, m) \ NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, NG_NOFLAGS) /* Send a previously unpackaged mbuf. XXX: This should be called * NG_SEND_DATA in future, but this name is kept for compatibility * reasons. */ #define NG_SEND_DATA_FLAGS(error, hook, m, flags) \ do { \ item_p _item; \ if ((_item = ng_package_data((m), flags))) { \ NG_FWD_ITEM_HOOK_FLAGS(error, _item, hook, flags);\ } else { \ (error) = ENOMEM; \ } \ (m) = NULL; \ } while (0) #define NG_SEND_DATA_ONLY(error, hook, m) \ NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS) /* NG_SEND_DATA() compat for meta-data times */ #define NG_SEND_DATA(error, hook, m, x) \ NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS) #define NG_FREE_MSG(msg) \ do { \ if ((msg)) { \ FREE((msg), M_NETGRAPH_MSG); \ (msg) = NULL; \ } \ } while (0) #define NG_FREE_M(m) \ do { \ if ((m)) { \ m_freem((m)); \ (m) = NULL; \ } \ } while (0) /***************************************** * Message macros *****************************************/ #define NG_SEND_MSG_HOOK(error, here, msg, hook, retaddr) \ do { \ item_p _item; \ if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\ (msg) = NULL; \ (error) = ENOMEM; \ break; \ } \ if (((error) = ng_address_hook((here), (_item), \ (hook), (retaddr))) == 0) { \ SAVE_LINE(_item); \ (error) = ng_snd_item((_item), 0); \ } \ (msg) = NULL; \ } while (0) #define NG_SEND_MSG_PATH(error, here, msg, path, retaddr) \ do { \ item_p _item; \ if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\ (msg) = NULL; \ (error) = ENOMEM; \ break; \ } \ if (((error) = ng_address_path((here), (_item), \ (path), (retaddr))) == 0) { \ SAVE_LINE(_item); \ (error) = ng_snd_item((_item), 0); \ } \ (msg) = NULL; \ } while (0) #define NG_SEND_MSG_ID(error, here, msg, ID, retaddr) \ do { \ item_p _item; \ if ((_item = 
ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\ (msg) = NULL; \ (error) = ENOMEM; \ break; \ } \ if (((error) = ng_address_ID((here), (_item), \ (ID), (retaddr))) == 0) { \ SAVE_LINE(_item); \ (error) = ng_snd_item((_item), 0); \ } \ (msg) = NULL; \ } while (0) /* * Redirect the message to the next hop using the given hook. * ng_retarget_msg() frees the item if there is an error * and returns an error code. It returns 0 on success. */ #define NG_FWD_MSG_HOOK(error, here, item, hook, retaddr) \ do { \ if (((error) = ng_address_hook((here), (item), \ (hook), (retaddr))) == 0) { \ SAVE_LINE(item); \ (error) = ng_snd_item((item), 0); \ } \ (item) = NULL; \ } while (0) /* * Send a queue item back to it's originator with a response message. * Assume original message was removed and freed separatly. */ #define NG_RESPOND_MSG(error, here, item, resp) \ do { \ if (resp) { \ ng_ID_t _dest = NGI_RETADDR(item); \ NGI_RETADDR(item) = 0; \ NGI_MSG(item) = resp; \ if ((error = ng_address_ID((here), (item), \ _dest, 0)) == 0) { \ SAVE_LINE(item); \ (error) = ng_snd_item((item), NG_QUEUE);\ } \ } else \ NG_FREE_ITEM(item); \ (item) = NULL; \ } while (0) /*********************************************************************** ******** Structures Definitions and Macros for defining a node ******* *********************************************************************** * * Here we define the structures needed to actually define a new node * type. */ /* * Command list -- each node type specifies the command that it knows * how to convert between ASCII and binary using an array of these. * The last element in the array must be a terminator with cookie=0. 
*/

struct ng_cmdlist {
	u_int32_t			cookie;		/* command typecookie */
	int				cmd;		/* command number */
	const char			*name;		/* command name */
	const struct ng_parse_type	*mesgType;	/* args if !NGF_RESP */
	const struct ng_parse_type	*respType;	/* args if NGF_RESP */
};

/*
 * Structure of a node type
 * If data is sent to the "rcvdata()" entrypoint then the system
 * may decide to defer it until later by queueing it with the normal netgraph
 * input queuing system.  This is decided by the HK_QUEUE flag being set in
 * the flags word of the peer (receiving) hook. The dequeuing mechanism will
 * ensure it is not requeued again.
 * Note the input queueing system is to allow modules
 * to 'release the stack' or to pass data across spl layers.
 * The data will be redelivered as soon as the NETISR code runs
 * which may be almost immediately.  A node may also do its own queueing
 * for other reasons (e.g. device output queuing).
 */
struct ng_type {

	u_int32_t	version; 	/* must equal NG_API_VERSION */
	const char	*name;		/* Unique type name */
	modeventhand_t	mod_event;	/* Module event handler (optional) */
	ng_constructor_t *constructor;	/* Node constructor */
	ng_rcvmsg_t	*rcvmsg;	/* control messages come here */
	ng_close_t	*close;		/* warn about forthcoming shutdown */
	ng_shutdown_t	*shutdown;	/* reset, and free resources */
	ng_newhook_t	*newhook;	/* first notification of new hook */
	ng_findhook_t	*findhook;	/* only if you have lots of hooks */
	ng_connect_t	*connect;	/* final notification of new hook */
	ng_rcvdata_t	*rcvdata;	/* data comes here */
	ng_disconnect_t	*disconnect;	/* notify on disconnect */

	const struct	ng_cmdlist *cmdlist;	/* commands we can convert */

	/* R/W data private to the base netgraph code DON'T TOUCH! */
	LIST_ENTRY(ng_type) types;		/* linked list of all types */
	int		    refs;		/* number of instances */
};

/*
 * Use the NETGRAPH_INIT() macro to link a node type into the
 * netgraph system. This works for types compiled into the kernel
 * as well as KLD modules.
The first argument should be the type
 * name (eg, echo) and the second a pointer to the type struct.
 *
 * If a different link time is desired, e.g., a device driver that
 * needs to install its netgraph type before probing, use the
 * NETGRAPH_INIT_ORDERED() macro instead.  Device drivers probably
 * want to use SI_SUB_DRIVERS/SI_ORDER_FIRST.
 *
 * The macro defines a static moduledata_t named ng_<typename>_mod whose
 * event handler is ng_mod_event and whose third member is typestructp,
 * declares the module, and adds a dependency on "netgraph" pinned to
 * NG_ABI_VERSION for all three version arguments.
 */
#define NETGRAPH_INIT_ORDERED(typename, typestructp, sub, order)	\
static moduledata_t ng_##typename##_mod = {				\
	"ng_" #typename,						\
	ng_mod_event,							\
	(typestructp)							\
};									\
DECLARE_MODULE(ng_##typename, ng_##typename##_mod, sub, order);		\
MODULE_DEPEND(ng_##typename, netgraph,	NG_ABI_VERSION,			\
					NG_ABI_VERSION,			\
					NG_ABI_VERSION)

#define NETGRAPH_INIT(tn, tp)						\
	NETGRAPH_INIT_ORDERED(tn, tp, SI_SUB_PSEUDO, SI_ORDER_ANY)

/* Special malloc() type for netgraph structs and ctrl messages */
/* Only these two types should be visible to nodes */
MALLOC_DECLARE(M_NETGRAPH);
MALLOC_DECLARE(M_NETGRAPH_MSG);

/* declare the base of the netgraph sysctl hierarchy */
/* but only if this file cares about sysctls */
#ifdef	SYSCTL_DECL
SYSCTL_DECL(_net_graph);
#endif

/*
 * Methods that the nodes can use.
 * Many of these methods should usually NOT be used directly but via
 * Macros above.
*/
int	ng_address_ID(node_p here, item_p item, ng_ID_t ID, ng_ID_t retaddr);
int	ng_address_hook(node_p here, item_p item, hook_p hook, ng_ID_t retaddr);
int	ng_address_path(node_p here, item_p item, char *address, ng_ID_t raddr);
int	ng_bypass(hook_p hook1, hook_p hook2);
hook_p	ng_findhook(node_p node, const char *name);
struct	ng_type *ng_findtype(const char *type);
int	ng_make_node_common(struct ng_type *typep, node_p *nodep);
int	ng_name_node(node_p node, const char *name);
int	ng_newtype(struct ng_type *tp);
ng_ID_t ng_node2ID(node_p node);
item_p	ng_package_data(struct mbuf *m, int flags);
item_p	ng_package_msg(struct ng_mesg *msg, int flags);
item_p	ng_package_msg_self(node_p here, hook_p hook, struct ng_mesg *msg);
void	ng_replace_retaddr(node_p here, item_p item, ng_ID_t retaddr);
int	ng_rmhook_self(hook_p hook);	/* if a node wants to kill a hook */
int	ng_rmnode_self(node_p here);	/* if a node wants to suicide */
int	ng_rmtype(struct ng_type *tp);
/*
 * NOTE(review): the second parameter is named "queue", but the macros in
 * this header pass NG_* flag values (e.g. NG_QUEUE) -- confirm naming.
 */
int	ng_snd_item(item_p item, int queue);
int 	ng_send_fn1(node_p node, hook_p hook, ng_item_fn *fn, void *arg1,
	int arg2, int flags);
/* ng_send_fn1() with NG_NOFLAGS. */
#define	ng_send_fn(node, hook, fn, arg1, arg2)	\
	ng_send_fn1(node, hook, fn, arg1, arg2, NG_NOFLAGS)
int	ng_uncallout(struct callout *c, node_p node);
int	ng_callout(struct callout *c, node_p node, hook_p hook, int ticks,
	    ng_item_fn *fn, void * arg1, int arg2);
-#define	ng_callout_init(c)	callout_init(c, NET_CALLOUT_MPSAFE)
+#define	ng_callout_init(c)	callout_init(c, CALLOUT_MPSAFE)

/* Flags for netgraph functions. */
#define	NG_NOFLAGS	0x00000000	/* no special options */
#define	NG_QUEUE	0x00000001	/* enqueue item, don't dispatch */
#define	NG_WAITOK	0x00000002	/* use M_WAITOK, etc.
*/
#define	NG_PROGRESS	0x00000004	/* return EINPROGRESS if queued */

/*
 * prototypes the user should DEFINITELY not use directly
 */
void	ng_free_item(item_p item); /* Use NG_FREE_ITEM instead */
int	ng_mod_event(module_t mod, int what, void *arg);

/*
 * Tag definitions and constants
 */

#define	NG_TAG_PRIO	1

/* Payload of an NG_TAG_PRIO tag: a struct m_tag followed by two chars. */
struct ng_tag_prio {
	struct m_tag	tag;
	char	priority;
	char	discardability;
};

#define	NG_PRIO_CUTOFF		32
#define	NG_PRIO_LINKSTATE	64

/* Macros and declarations to keep compatibility with metadata, which
 * is obsoleted now. To be deleted.
 *
 * The meta accessors expand to NULL and the free/get macros expand to
 * nothing.
 */
typedef void *meta_p;
#define _NGI_META(i)	NULL
#define NGI_META(i)	NULL
#define NG_FREE_META(meta)
#define NGI_GET_META(i,m)
#define	ng_copy_meta(meta) NULL

#endif /* _NETGRAPH_NETGRAPH_H_ */
Index: head/sys/netinet/ip_carp.c
===================================================================
--- head/sys/netinet/ip_carp.c	(revision 171636)
+++ head/sys/netinet/ip_carp.c	(revision 171637)
@@ -1,2220 +1,2220 @@
/*	$FreeBSD$ */

/*
 * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
 * Copyright (c) 2003 Ryan McBride. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_carp.h"
#include "opt_bpf.h"
#include "opt_inet.h"
#include "opt_inet6.h"

/*
 * NOTE(review): the header names of the #include lines below are missing
 * in this copy of the file (only the bare directive survives) -- restore
 * them from the upstream source before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef INET
#include
#include
#include
#include
#include
#include
#include
#endif

#ifdef INET6
#include
#include
#include
#include
#include
#endif

#include
#include

#define	CARP_IFNAME	"carp"
static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
SYSCTL_DECL(_net_inet_carp);

/* Per-interface state of one carp(4) pseudo-interface. */
struct carp_softc {
	struct ifnet	 	*sc_ifp;	/* Interface clue */
	struct ifnet		*sc_carpdev;	/* Pointer to parent interface */
	struct in_ifaddr 	*sc_ia;		/* primary iface address */
	struct ip_moptions 	 sc_imo;
#ifdef INET6
	struct in6_ifaddr 	*sc_ia6;	/* primary iface address v6 */
	struct ip6_moptions 	 sc_im6o;
#endif /* INET6 */
	TAILQ_ENTRY(carp_softc)	 sc_list;

	enum { INIT = 0, BACKUP, MASTER }	sc_state;

	int			 sc_flags_backup;
	int			 sc_suppress;

	int			 sc_sendad_errors;
#define	CARP_SENDAD_MAX_ERRORS	3
	int			 sc_sendad_success;
#define	CARP_SENDAD_MIN_SUCCESS 3

	int			 sc_vhid;
	int			 sc_advskew;
	int			 sc_naddrs;
	int			 sc_naddrs6;
	int			 sc_advbase;	/* seconds */
	int			 sc_init_counter;
	u_int64_t		 sc_counter;

	/* authentication */
#define CARP_HMAC_PAD	64
	unsigned char sc_key[CARP_KEY_LEN];
	unsigned char sc_pad[CARP_HMAC_PAD];
	SHA1_CTX sc_sha1;

	struct callout		 sc_ad_tmo;	/* advertisement timeout */
	struct callout		 sc_md_tmo;	/* master down timeout */
	struct callout 		 sc_md6_tmo;	/* master down timeout */

	LIST_ENTRY(carp_softc)	 sc_next;	/* Interface clue */
};
#define	SC2IFP(sc)	((sc)->sc_ifp)

int carp_suppress_preempt = 0;
/* Backing store for the sysctl knobs below, indexed by CARPCTL_* id. */
int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 };	/* XXX for now */
SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
    &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
    &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
    &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
    &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
    &carp_suppress_preempt, 0, "Preemption is suppressed");

struct carpstats carpstats;
SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
    &carpstats, carpstats,
    "CARP statistics (struct carpstats, netinet/ip_carp.h)");

/* State hung off a parent interface (reached through if_carp, see SC2CIF). */
struct carp_if {
	TAILQ_HEAD(, carp_softc) vhif_vrs;
	int vhif_nvrs;

	struct ifnet 	*vhif_ifp;
	struct mtx	 vhif_mtx;
};

/* Get carp_if from softc. Valid after carp_set_addr{,6}. */
#define	SC2CIF(sc)		((struct carp_if *)(sc)->sc_carpdev->if_carp)

/* lock per carp_if queue */
#define	CARP_LOCK_INIT(cif)	mtx_init(&(cif)->vhif_mtx, "carp_if", 	\
	NULL, MTX_DEF)
#define	CARP_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->vhif_mtx)
#define	CARP_LOCK_ASSERT(cif)	mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
#define	CARP_LOCK(cif)		mtx_lock(&(cif)->vhif_mtx)
#define	CARP_UNLOCK(cif)	mtx_unlock(&(cif)->vhif_mtx)

/* Same mutex, reached from a softc; these dereference sc->sc_carpdev. */
#define	CARP_SCLOCK(sc)		mtx_lock(&SC2CIF(sc)->vhif_mtx)
#define	CARP_SCUNLOCK(sc)	mtx_unlock(&SC2CIF(sc)->vhif_mtx)
#define	CARP_SCLOCK_ASSERT(sc)	mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)

/* Log at LOG_INFO when the CARPCTL_LOG knob is greater than 0. */
#define	CARP_LOG(...)	do {				\
	if (carp_opts[CARPCTL_LOG] > 0)			\
		log(LOG_INFO, __VA_ARGS__);		\
} while (0)

/* Log at LOG_DEBUG when the CARPCTL_LOG knob is greater than 1. */
#define	CARP_DEBUG(...)	do {				\
	if (carp_opts[CARPCTL_LOG] > 1)			\
		log(LOG_DEBUG, __VA_ARGS__);		\
} while (0)

static void	carp_hmac_prepare(struct carp_softc *);
static void	carp_hmac_generate(struct carp_softc *, u_int32_t *,
		    unsigned char *);
static int	carp_hmac_verify(struct carp_softc *, u_int32_t *,
		    unsigned char *);
static void	carp_setroute(struct carp_softc *, int);
static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
static int 	carp_clone_create(struct if_clone *, int, caddr_t);
static void 	carp_clone_destroy(struct ifnet *);
static void	carpdetach(struct carp_softc *, int);
static int	carp_prepare_ad(struct mbuf *, struct carp_softc *,
		    struct carp_header *);
static void	carp_send_ad_all(void);
static void	carp_send_ad(void *);
static void	carp_send_ad_locked(struct carp_softc *);
static void	carp_send_arp(struct carp_softc *);
static void	carp_master_down(void *);
static void	carp_master_down_locked(struct carp_softc *);
static int	carp_ioctl(struct ifnet *, u_long, caddr_t);
static int	carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
		    struct rtentry *);
static void	carp_start(struct ifnet *);
static void	carp_setrun(struct carp_softc *, sa_family_t);
static void	carp_set_state(struct carp_softc *, int);
static int	carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
enum	{ CARP_COUNT_MASTER, CARP_COUNT_RUNNING };

static void	carp_multicast_cleanup(struct carp_softc *);
static int	carp_set_addr(struct carp_softc *, struct sockaddr_in *);
static int	carp_del_addr(struct carp_softc *, struct sockaddr_in *);
static void	carp_carpdev_state_locked(struct carp_if *);
static void	carp_sc_state_locked(struct carp_softc *);
#ifdef INET6
static void	carp_send_na(struct carp_softc *);
static int	carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
static int	carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
static void
carp_multicast6_cleanup(struct carp_softc *); #endif static LIST_HEAD(, carp_softc) carpif_list; static struct mtx carp_mtx; IFC_SIMPLE_DECLARE(carp, 0); static eventhandler_tag if_detach_event_tag; static __inline u_int16_t carp_cksum(struct mbuf *m, int len) { return (in_cksum(m, len)); } static void carp_hmac_prepare(struct carp_softc *sc) { u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; u_int8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i; #ifdef INET6 struct in6_addr in6; #endif if (sc->sc_carpdev) CARP_SCLOCK(sc); /* XXX: possible race here */ /* compute ipad from key */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* precompute first part of inner hash */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET) SHA1Update(&sc->sc_sha1, (void *)&ifatoia(ifa)->ia_addr.sin_addr.s_addr, sizeof(struct in_addr)); } #endif /* INET */ #ifdef INET6 TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET6) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; in6_clearscope(&in6); SHA1Update(&sc->sc_sha1, (void *)&in6, sizeof(in6)); } } #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; if (sc->sc_carpdev) CARP_SCUNLOCK(sc); } static void carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ 
SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_SCLOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } static void carp_setroute(struct carp_softc *sc, int cmd) { struct ifaddr *ifa; int s; if (sc->sc_carpdev) CARP_SCLOCK_ASSERT(sc); s = splnet(); TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET && sc->sc_carpdev != NULL) { int count = carp_addrcount( (struct carp_if *)sc->sc_carpdev->if_carp, ifatoia(ifa), CARP_COUNT_MASTER); if ((cmd == RTM_ADD && count == 1) || (cmd == RTM_DELETE && count == 0)) rtinit(ifa, cmd, RTF_UP | RTF_HOST); } #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { if (cmd == RTM_ADD) in6_ifaddloop(ifa); else in6_ifremloop(ifa); } #endif /* INET6 */ } splx(s); } static int carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct carp_softc *sc; struct ifnet *ifp; MALLOC(sc, struct carp_softc *, sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); ifp = SC2IFP(sc) = if_alloc(IFT_ETHER); if (ifp == NULL) { FREE(sc, M_CARP); return (ENOSPC); } sc->sc_flags_backup = 0; sc->sc_suppress = 0; sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_advskew = 0; sc->sc_init_counter = 1; sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? 
*/ #ifdef INET6 sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; #endif sc->sc_imo.imo_membership = (struct in_multi **)malloc( (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, M_WAITOK); sc->sc_imo.imo_mfilters = NULL; sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; sc->sc_imo.imo_multicast_vif = -1; - callout_init(&sc->sc_ad_tmo, NET_CALLOUT_MPSAFE); - callout_init(&sc->sc_md_tmo, NET_CALLOUT_MPSAFE); - callout_init(&sc->sc_md6_tmo, NET_CALLOUT_MPSAFE); + callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE); + callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE); + callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE); ifp->if_softc = sc; if_initname(ifp, CARP_IFNAME, unit); ifp->if_mtu = ETHERMTU; ifp->if_flags = IFF_LOOPBACK; ifp->if_ioctl = carp_ioctl; ifp->if_output = carp_looutput; ifp->if_start = carp_start; ifp->if_type = IFT_CARP; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_hdrlen = 0; if_attach(ifp); bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t)); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carpif_list, sc, sc_next); mtx_unlock(&carp_mtx); return (0); } static void carp_clone_destroy(struct ifnet *ifp) { struct carp_softc *sc = ifp->if_softc; if (sc->sc_carpdev) CARP_SCLOCK(sc); carpdetach(sc, 1); /* Returns unlocked. */ mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); bpfdetach(ifp); if_detach(ifp); if_free_type(ifp, IFT_ETHER); free(sc->sc_imo.imo_membership, M_CARP); free(sc, M_CARP); } /* * This function can be called on CARP interface destroy path, * and in case of the removal of the underlying interface as * well. We differentiate these two cases. In the latter case * we do not cleanup our multicast memberships, since they * are already freed. Also, in the latter case we do not * release the lock on return, because the function will be * called once more, for another CARP instance on the same * interface. 
*/ static void carpdetach(struct carp_softc *sc, int unlock) { struct carp_if *cif; callout_stop(&sc->sc_ad_tmo); callout_stop(&sc->sc_md_tmo); callout_stop(&sc->sc_md6_tmo); if (sc->sc_suppress) carp_suppress_preempt--; sc->sc_suppress = 0; if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) carp_suppress_preempt--; sc->sc_sendad_errors = 0; carp_set_state(sc, INIT); SC2IFP(sc)->if_flags &= ~IFF_UP; carp_setrun(sc, 0); if (unlock) carp_multicast_cleanup(sc); #ifdef INET6 carp_multicast6_cleanup(sc); #endif if (sc->sc_carpdev != NULL) { cif = (struct carp_if *)sc->sc_carpdev->if_carp; CARP_LOCK_ASSERT(cif); TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); if (!--cif->vhif_nvrs) { ifpromisc(sc->sc_carpdev, 0); sc->sc_carpdev->if_carp = NULL; CARP_LOCK_DESTROY(cif); FREE(cif, M_IFADDR); } else if (unlock) CARP_UNLOCK(cif); sc->sc_carpdev = NULL; } } /* Detach an interface from the carp. */ static void carp_ifdetach(void *arg __unused, struct ifnet *ifp) { struct carp_if *cif = (struct carp_if *)ifp->if_carp; struct carp_softc *sc, *nextsc; if (cif == NULL) return; /* * XXX: At the end of for() cycle the lock will be destroyed. */ CARP_LOCK(cif); for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) { nextsc = TAILQ_NEXT(sc, sc_list); carpdetach(sc, 0); } } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ void carp_input(struct mbuf *m, int hlen) { struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; carpstats.carps_ipackets++; if (!carp_opts[CARPCTL_ALLOW]) { m_freem(m); return; } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { carpstats.carps_badif++; CARP_LOG("carp_input: packet received on non-carp " "interface: %s\n", m->m_pkthdr.rcvif->if_xname); m_freem(m); return; } /* verify that the IP TTL is 255. 
*/ if (ip->ip_ttl != CARP_DFLTTL) { carpstats.carps_badttl++; CARP_LOG("carp_input: received ttl %d != 255i on %s\n", ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); return; } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { carpstats.carps_badlen++; CARP_LOG("carp_input: received len %zd < " "sizeof(struct carp_header)\n", m->m_len - sizeof(struct ip)); m_freem(m); return; } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { carpstats.carps_hdrops++; CARP_LOG("carp_input: pullup failed\n"); return; } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { carpstats.carps_badlen++; CARP_LOG("carp_input: packet too short %d on %s\n", m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); return; } if ((m = m_pullup(m, len)) == NULL) { carpstats.carps_hdrops++; return; } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (carp_cksum(m, len - iplen)) { carpstats.carps_badsum++; CARP_LOG("carp_input: checksum failed on %s\n", m->m_pkthdr.rcvif->if_xname); m_freem(m); return; } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); } #ifdef INET6 int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; carpstats.carps_ipackets6++; if (!carp_opts[CARPCTL_ALLOW]) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { carpstats.carps_badif++; CARP_LOG("carp6_input: packet received on non-carp " "interface: %s\n", m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { carpstats.carps_badttl++; CARP_LOG("carp6_input: 
received ttl %d != 255 on %s\n", ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ len = m->m_len; IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { carpstats.carps_badlen++; CARP_LOG("carp6_input: packet size %u too small\n", len); return (IPPROTO_DONE); } /* verify the CARP checksum */ m->m_data += *offp; if (carp_cksum(m, sizeof(*ch))) { carpstats.carps_badsum++; CARP_LOG("carp6_input: checksum failed, on %s\n", m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6); return (IPPROTO_DONE); } #endif /* INET6 */ static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct carp_softc *sc; u_int64_t tmp_counter; struct timeval sc_tv, ch_tv; /* verify that the VHID is valid on the receiving interface */ CARP_LOCK(ifp->if_carp); TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) if (sc->sc_vhid == ch->carp_vhid) break; if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) && (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { carpstats.carps_badvhid++; CARP_UNLOCK(ifp->if_carp); m_freem(m); return; } getmicrotime(&SC2IFP(sc)->if_lastchange); SC2IFP(sc)->if_ipackets++; SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { struct ip *ip = mtod(m, struct ip *); uint32_t af1 = af; /* BPF wants net byte order */ ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); ip->ip_off = htons(ip->ip_off); bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); } /* verify the CARP version. 
*/ if (ch->carp_version != CARP_VERSION) { carpstats.carps_badver++; SC2IFP(sc)->if_ierrors++; CARP_UNLOCK(ifp->if_carp); CARP_LOG("%s; invalid version %d\n", SC2IFP(sc)->if_xname, ch->carp_version); m_freem(m); return; } /* verify the hash */ if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { carpstats.carps_badauth++; SC2IFP(sc)->if_ierrors++; CARP_UNLOCK(ifp->if_carp); CARP_LOG("%s: incorrect hash\n", SC2IFP(sc)->if_xname); m_freem(m); return; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; if (carp_suppress_preempt && sc->sc_advskew < 240) sc_tv.tv_usec = 240 * 1000000 / 256; else sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); CARP_DEBUG("%s: MASTER -> BACKUP " "(more frequent advertisement received)\n", SC2IFP(sc)->if_xname); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); carp_setroute(sc, RTM_DELETE); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (carp_opts[CARPCTL_PREEMPT] && timevalcmp(&sc_tv, &ch_tv, <)) { CARP_DEBUG("%s: BACKUP -> MASTER " "(preempting a slower master)\n", SC2IFP(sc)->if_xname); carp_master_down_locked(sc); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. 
*/ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { CARP_DEBUG("%s: BACKUP -> MASTER " "(master timed out)\n", SC2IFP(sc)->if_xname); carp_master_down_locked(sc); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } CARP_UNLOCK(ifp->if_carp); m_freem(m); return; } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; struct ifnet *ifp = SC2IFP(sc); if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) { m_freem(m); SC2IFP(sc)->if_oerrors++; return (ENOMEM); } bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); m_tag_prepend(m, mtag); return (0); } static void carp_send_ad_all(void) { struct carp_softc *sc; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carpif_list, sc_next) { if (sc->sc_carpdev == NULL) continue; CARP_SCLOCK(sc); if ((SC2IFP(sc)->if_flags & IFF_UP) && (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) && sc->sc_state == MASTER) carp_send_ad_locked(sc); CARP_SCUNLOCK(sc); } mtx_unlock(&carp_mtx); } static void carp_send_ad(void *v) { struct carp_softc *sc = v; CARP_SCLOCK(sc); carp_send_ad_locked(sc); CARP_SCUNLOCK(sc); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct carp_header *ch_ptr; struct mbuf *m; int len, advbase, advskew; CARP_SCLOCK_ASSERT(sc); /* bow out if we've lost our UPness or RUNNINGuiness */ if (!((SC2IFP(sc)->if_flags & IFF_UP) && (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { advbase = 255; advskew = 
255; } else { advbase = sc->sc_advbase; if (!carp_suppress_preempt || sc->sc_advskew > 240) advskew = sc->sc_advskew; else advskew = 240; tv.tv_sec = advbase; tv.tv_usec = advskew * 1000000 / 256; } ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; #ifdef INET if (sc->sc_ia) { struct ip *ip; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { SC2IFP(sc)->if_oerrors++; carpstats.carps_onomem++; /* XXX maybe less ? */ if (advbase != 255 || advskew != 255) callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); return; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; MH_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = len; ip->ip_id = ip_newid(); ip->ip_off = IP_DF; ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) return; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); getmicrotime(&SC2IFP(sc)->if_lastchange); SC2IFP(sc)->if_opackets++; SC2IFP(sc)->if_obytes += len; carpstats.carps_opackets++; if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { SC2IFP(sc)->if_oerrors++; if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; if (carp_suppress_preempt == 1) { CARP_SCUNLOCK(sc); carp_send_ad_all(); CARP_SCLOCK(sc); } } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { if (++sc->sc_sendad_success 
>= CARP_SENDAD_MIN_SUCCESS) { carp_suppress_preempt--; sc->sc_sendad_errors = 0; } } else sc->sc_sendad_errors = 0; } } #endif /* INET */ #ifdef INET6 if (sc->sc_ia6) { struct ip6_hdr *ip6; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { SC2IFP(sc)->if_oerrors++; carpstats.carps_onomem++; /* XXX maybe less ? */ if (advbase != 255 || advskew != 255) callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); return; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; MH_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr)); /* set the multicast destination */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { SC2IFP(sc)->if_oerrors++; m_freem(m); CARP_LOG("%s: in6_setscope failed\n", __func__); return; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) return; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); getmicrotime(&SC2IFP(sc)->if_lastchange); SC2IFP(sc)->if_opackets++; SC2IFP(sc)->if_obytes += len; carpstats.carps_opackets6++; if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { SC2IFP(sc)->if_oerrors++; if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; if (carp_suppress_preempt == 1) { CARP_SCUNLOCK(sc); carp_send_ad_all(); CARP_SCLOCK(sc); } } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { carp_suppress_preempt--; sc->sc_sendad_errors = 0; } } else sc->sc_sendad_errors = 0; } } #endif /* INET6 
*/ if (advbase != 255 || advskew != 255) callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */ arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp)); DELAY(1000); /* XXX */ } } #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { struct ifaddr *ifa; struct in6_addr *in6; static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } #endif /* INET6 */ static int carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) { struct carp_softc *vh; struct ifaddr *ifa; int count = 0; CARP_LOCK_ASSERT(cif); TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { if ((type == CARP_COUNT_RUNNING && (SC2IFP(vh)->if_flags & IFF_UP) && (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) || (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET && ia->ia_addr.sin_addr.s_addr == ifatoia(ifa)->ia_addr.sin_addr.s_addr) count++; } } } return (count); } int carp_iamatch(void *v, struct in_ifaddr *ia, struct in_addr *isaddr, u_int8_t **enaddr) { struct carp_if *cif = v; struct carp_softc *vh; int index, count = 0; struct ifaddr *ifa; CARP_LOCK(cif); if (carp_opts[CARPCTL_ARPBALANCE]) { /* * XXX proof of concept implementation. * We use the source ip to decide which virtual host should * handle the request. 
If we're master of that virtual host, * then we respond, otherwise, just drop the arp packet on * the floor. */ count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); if (count == 0) { /* should never reach this */ CARP_UNLOCK(cif); return (0); } /* this should be a hash, like pf_hash() */ index = ntohl(isaddr->s_addr) % count; count = 0; TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { if ((SC2IFP(vh)->if_flags & IFF_UP) && (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) { TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET && ia->ia_addr.sin_addr.s_addr == ifatoia(ifa)->ia_addr.sin_addr.s_addr) { if (count == index) { if (vh->sc_state == MASTER) { *enaddr = IF_LLADDR(vh->sc_ifp); CARP_UNLOCK(cif); return (1); } else { CARP_UNLOCK(cif); return (0); } } count++; } } } } } else { TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { if ((SC2IFP(vh)->if_flags & IFF_UP) && (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && ia->ia_ifp == SC2IFP(vh) && vh->sc_state == MASTER) { *enaddr = IF_LLADDR(vh->sc_ifp); CARP_UNLOCK(cif); return (1); } } } CARP_UNLOCK(cif); return (0); } #ifdef INET6 struct ifaddr * carp_iamatch6(void *v, struct in6_addr *taddr) { struct carp_if *cif = v; struct carp_softc *vh; struct ifaddr *ifa; CARP_LOCK(cif); TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { if (IN6_ARE_ADDR_EQUAL(taddr, &ifatoia6(ifa)->ia_addr.sin6_addr) && (SC2IFP(vh)->if_flags & IFF_UP) && (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && vh->sc_state == MASTER) { CARP_UNLOCK(cif); return (ifa); } } } CARP_UNLOCK(cif); return (NULL); } void * carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr) { struct m_tag *mtag; struct carp_if *cif = v; struct carp_softc *sc; struct ifaddr *ifa; CARP_LOCK(cif); TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { if (IN6_ARE_ADDR_EQUAL(taddr, &ifatoia6(ifa)->ia_addr.sin6_addr) && 
(SC2IFP(sc)->if_flags & IFF_UP) && (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) { struct ifnet *ifp = SC2IFP(sc); mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) { /* better a bit than nothing */ CARP_UNLOCK(cif); return (IF_LLADDR(sc->sc_ifp)); } bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); m_tag_prepend(m, mtag); CARP_UNLOCK(cif); return (IF_LLADDR(sc->sc_ifp)); } } } CARP_UNLOCK(cif); return (NULL); } #endif struct ifnet * carp_forus(void *v, void *dhost) { struct carp_if *cif = v; struct carp_softc *vh; u_int8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (NULL); CARP_LOCK(cif); TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) if ((SC2IFP(vh)->if_flags & IFF_UP) && (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && vh->sc_state == MASTER && !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) { CARP_UNLOCK(cif); return (SC2IFP(vh)); } CARP_UNLOCK(cif); return (NULL); } static void carp_master_down(void *v) { struct carp_softc *sc = v; CARP_SCLOCK(sc); carp_master_down_locked(sc); CARP_SCUNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc) { if (sc->sc_carpdev) CARP_SCLOCK_ASSERT(sc); switch (sc->sc_state) { case INIT: printf("%s: master_down event in INIT state\n", SC2IFP(sc)->if_xname); break; case MASTER: break; case BACKUP: carp_set_state(sc, MASTER); carp_send_ad_locked(sc); carp_send_arp(sc); #ifdef INET6 carp_send_na(sc); #endif /* INET6 */ carp_setrun(sc, 0); carp_setroute(sc, RTM_ADD); break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. 
*/ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; if (sc->sc_carpdev == NULL) { SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; carp_set_state(sc, INIT); return; } else CARP_SCLOCK_ASSERT(sc); if (SC2IFP(sc)->if_flags & IFF_UP && sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6)) SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else { SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; carp_setroute(sc, RTM_DELETE); return; } switch (sc->sc_state) { case INIT: if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) { carp_send_ad_locked(sc); carp_send_arp(sc); #ifdef INET6 carp_send_na(sc); #endif /* INET6 */ CARP_DEBUG("%s: INIT -> MASTER (preempting)\n", SC2IFP(sc)->if_xname); carp_set_state(sc, MASTER); carp_setroute(sc, RTM_ADD); } else { CARP_DEBUG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname); carp_set_state(sc, BACKUP); carp_setroute(sc, RTM_DELETE); carp_setrun(sc, 0); } break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif /* INET */ #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif /* INET6 */ default: if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } static void carp_multicast_cleanup(struct carp_softc *sc) { struct ip_moptions *imo = &sc->sc_imo; u_int16_t n = imo->imo_num_memberships; /* Clean up our own multicast memberships */ while (n-- > 0) { if (imo->imo_membership[n] != NULL) { in_delmulti(imo->imo_membership[n]); imo->imo_membership[n] = NULL; } } KASSERT(imo->imo_mfilters == 
NULL, ("%s: imo_mfilters != NULL", __func__)); imo->imo_num_memberships = 0; imo->imo_multicast_ifp = NULL; } #ifdef INET6 static void carp_multicast6_cleanup(struct carp_softc *sc) { struct ip6_moptions *im6o = &sc->sc_im6o; while (!LIST_EMPTY(&im6o->im6o_memberships)) { struct in6_multi_mship *imm = LIST_FIRST(&im6o->im6o_memberships); LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } im6o->im6o_multicast_ifp = NULL; } #endif static int carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) { struct ifnet *ifp; struct carp_if *cif; struct in_ifaddr *ia, *ia_if; struct ip_moptions *imo = &sc->sc_imo; struct in_addr addr; u_long iaddr = htonl(sin->sin_addr.s_addr); int own, error; if (sin->sin_addr.s_addr == 0) { if (!(SC2IFP(sc)->if_flags & IFF_UP)) carp_set_state(sc, INIT); if (sc->sc_naddrs) SC2IFP(sc)->if_flags |= IFF_UP; carp_setrun(sc, 0); return (0); } /* we have to do it by hands to check we won't match on us */ ia_if = NULL; own = 0; TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { /* and, yeah, we need a multicast-capable iface too */ if (ia->ia_ifp != SC2IFP(sc) && (ia->ia_ifp->if_flags & IFF_MULTICAST) && (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { if (!ia_if) ia_if = ia; if (sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) own++; } } if (!ia_if) return (EADDRNOTAVAIL); ia = ia_if; ifp = ia->ia_ifp; if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) return (EADDRNOTAVAIL); if (imo->imo_num_memberships == 0) { addr.s_addr = htonl(INADDR_CARP_GROUP); if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL) return (ENOBUFS); imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; } if (!ifp->if_carp) { MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if (!cif) { error = ENOBUFS; goto cleanup; } if ((error = ifpromisc(ifp, 1))) { FREE(cif, M_CARP); goto cleanup; } 
CARP_LOCK_INIT(cif); CARP_LOCK(cif); cif->vhif_ifp = ifp; TAILQ_INIT(&cif->vhif_vrs); ifp->if_carp = cif; } else { struct carp_softc *vr; cif = (struct carp_if *)ifp->if_carp; CARP_LOCK(cif); TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) if (vr != sc && vr->sc_vhid == sc->sc_vhid) { CARP_UNLOCK(cif); error = EINVAL; goto cleanup; } } sc->sc_ia = ia; sc->sc_carpdev = ifp; { /* XXX prevent endless loop if already in queue */ struct carp_softc *vr, *after = NULL; int myself = 0; cif = (struct carp_if *)ifp->if_carp; /* XXX: cif should not change, right? So we still hold the lock */ CARP_LOCK_ASSERT(cif); TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { if (vr == sc) myself = 1; if (vr->sc_vhid < sc->sc_vhid) after = vr; } if (!myself) { /* We're trying to keep things in order */ if (after == NULL) { TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); } else { TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); } cif->vhif_nvrs++; } } sc->sc_naddrs++; SC2IFP(sc)->if_flags |= IFF_UP; if (own) sc->sc_advskew = 0; carp_sc_state_locked(sc); carp_setrun(sc, 0); CARP_UNLOCK(cif); return (0); cleanup: in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); return (error); } static int carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) { int error = 0; if (!--sc->sc_naddrs) { struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; struct ip_moptions *imo = &sc->sc_imo; CARP_LOCK(cif); callout_stop(&sc->sc_ad_tmo); SC2IFP(sc)->if_flags &= ~IFF_UP; SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; sc->sc_vhid = -1; in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); imo->imo_multicast_ifp = NULL; TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); if (!--cif->vhif_nvrs) { sc->sc_carpdev->if_carp = NULL; CARP_LOCK_DESTROY(cif); FREE(cif, M_IFADDR); } else { CARP_UNLOCK(cif); } } return (error); } #ifdef INET6 static int carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) { struct ifnet *ifp; struct carp_if *cif; struct in6_ifaddr *ia, *ia_if; struct 
ip6_moptions *im6o = &sc->sc_im6o; struct in6_multi_mship *imm; struct in6_addr in6; int own, error; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { if (!(SC2IFP(sc)->if_flags & IFF_UP)) carp_set_state(sc, INIT); if (sc->sc_naddrs6) SC2IFP(sc)->if_flags |= IFF_UP; carp_setrun(sc, 0); return (0); } /* we have to do it by hands to check we won't match on us */ ia_if = NULL; own = 0; for (ia = in6_ifaddr; ia; ia = ia->ia_next) { int i; for (i = 0; i < 4; i++) { if ((sin6->sin6_addr.s6_addr32[i] & ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != (ia->ia_addr.sin6_addr.s6_addr32[i] & ia->ia_prefixmask.sin6_addr.s6_addr32[i])) break; } /* and, yeah, we need a multicast-capable iface too */ if (ia->ia_ifp != SC2IFP(sc) && (ia->ia_ifp->if_flags & IFF_MULTICAST) && (i == 4)) { if (!ia_if) ia_if = ia; if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ia->ia_addr.sin6_addr)) own++; } } if (!ia_if) return (EADDRNOTAVAIL); ia = ia_if; ifp = ia->ia_ifp; if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) return (EADDRNOTAVAIL); if (!sc->sc_naddrs6) { im6o->im6o_multicast_ifp = ifp; /* join CARP multicast address */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if (in6_setscope(&in6, ifp, NULL) != 0) goto cleanup; if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL) goto cleanup; LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); /* join solicited multicast address */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; in6.s6_addr8[12] = 0xff; if (in6_setscope(&in6, ifp, NULL) != 0) goto cleanup; if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL) goto cleanup; LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); } if (!ifp->if_carp) { MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if (!cif) { error = ENOBUFS; goto cleanup; } if 
((error = ifpromisc(ifp, 1))) { FREE(cif, M_CARP); goto cleanup; } CARP_LOCK_INIT(cif); CARP_LOCK(cif); cif->vhif_ifp = ifp; TAILQ_INIT(&cif->vhif_vrs); ifp->if_carp = cif; } else { struct carp_softc *vr; cif = (struct carp_if *)ifp->if_carp; CARP_LOCK(cif); TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) if (vr != sc && vr->sc_vhid == sc->sc_vhid) { CARP_UNLOCK(cif); error = EINVAL; goto cleanup; } } sc->sc_ia6 = ia; sc->sc_carpdev = ifp; { /* XXX prevent endless loop if already in queue */ struct carp_softc *vr, *after = NULL; int myself = 0; cif = (struct carp_if *)ifp->if_carp; CARP_LOCK_ASSERT(cif); TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { if (vr == sc) myself = 1; if (vr->sc_vhid < sc->sc_vhid) after = vr; } if (!myself) { /* We're trying to keep things in order */ if (after == NULL) { TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); } else { TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); } cif->vhif_nvrs++; } } sc->sc_naddrs6++; SC2IFP(sc)->if_flags |= IFF_UP; if (own) sc->sc_advskew = 0; carp_sc_state_locked(sc); carp_setrun(sc, 0); CARP_UNLOCK(cif); return (0); cleanup: /* clean up multicast memberships */ if (!sc->sc_naddrs6) { while (!LIST_EMPTY(&im6o->im6o_memberships)) { imm = LIST_FIRST(&im6o->im6o_memberships); LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } } return (error); } static int carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) { int error = 0; if (!--sc->sc_naddrs6) { struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; struct ip6_moptions *im6o = &sc->sc_im6o; CARP_LOCK(cif); callout_stop(&sc->sc_ad_tmo); SC2IFP(sc)->if_flags &= ~IFF_UP; SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; sc->sc_vhid = -1; while (!LIST_EMPTY(&im6o->im6o_memberships)) { struct in6_multi_mship *imm = LIST_FIRST(&im6o->im6o_memberships); LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } im6o->im6o_multicast_ifp = NULL; TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); if (!--cif->vhif_nvrs) { CARP_LOCK_DESTROY(cif); 
sc->sc_carpdev->if_carp = NULL; FREE(cif, M_IFADDR); } else CARP_UNLOCK(cif); } return (error); } #endif /* INET6 */ static int carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) { struct carp_softc *sc = ifp->if_softc, *vr; struct carpreq carpr; struct ifaddr *ifa; struct ifreq *ifr; struct ifaliasreq *ifra; int locked = 0, error = 0; ifa = (struct ifaddr *)addr; ifra = (struct ifaliasreq *)addr; ifr = (struct ifreq *)addr; switch (cmd) { case SIOCSIFADDR: switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: SC2IFP(sc)->if_flags |= IFF_UP; bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, sizeof(struct sockaddr)); error = carp_set_addr(sc, satosin(ifa->ifa_addr)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: SC2IFP(sc)->if_flags |= IFF_UP; error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); break; #endif /* INET6 */ default: error = EAFNOSUPPORT; break; } break; case SIOCAIFADDR: switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: SC2IFP(sc)->if_flags |= IFF_UP; bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, sizeof(struct sockaddr)); error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: SC2IFP(sc)->if_flags |= IFF_UP; error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); break; #endif /* INET6 */ default: error = EAFNOSUPPORT; break; } break; case SIOCDIFADDR: switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); break; #endif /* INET6 */ default: error = EAFNOSUPPORT; break; } break; case SIOCSIFFLAGS: if (sc->sc_carpdev) { locked = 1; CARP_SCLOCK(sc); } if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { callout_stop(&sc->sc_ad_tmo); callout_stop(&sc->sc_md_tmo); callout_stop(&sc->sc_md6_tmo); if (sc->sc_state == MASTER) carp_send_ad_locked(sc); carp_set_state(sc, INIT); carp_setrun(sc, 0); } else if (sc->sc_state == 
INIT && (ifr->ifr_flags & IFF_UP)) { SC2IFP(sc)->if_flags |= IFF_UP; carp_setrun(sc, 0); } break; case SIOCSVH: error = priv_check(curthread, PRIV_NETINET_CARP); if (error) break; if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) break; error = 1; if (sc->sc_carpdev) { locked = 1; CARP_SCLOCK(sc); } if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); carp_setroute(sc, RTM_DELETE); break; case MASTER: carp_master_down_locked(sc); break; default: break; } } if (carpr.carpr_vhid > 0) { if (carpr.carpr_vhid > 255) { error = EINVAL; break; } if (sc->sc_carpdev) { struct carp_if *cif; cif = (struct carp_if *)sc->sc_carpdev->if_carp; TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) if (vr != sc && vr->sc_vhid == carpr.carpr_vhid) { error = EEXIST; break; } if (error == EEXIST) break; } sc->sc_vhid = carpr.carpr_vhid; IF_LLADDR(sc->sc_ifp)[0] = 0; IF_LLADDR(sc->sc_ifp)[1] = 0; IF_LLADDR(sc->sc_ifp)[2] = 0x5e; IF_LLADDR(sc->sc_ifp)[3] = 0; IF_LLADDR(sc->sc_ifp)[4] = 1; IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid; error--; } if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { if (carpr.carpr_advskew >= 255) { error = EINVAL; break; } if (carpr.carpr_advbase > 255) { error = EINVAL; break; } sc->sc_advbase = carpr.carpr_advbase; sc->sc_advskew = carpr.carpr_advskew; error--; } bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); if (error > 0) error = EINVAL; else { error = 0; carp_setrun(sc, 0); } break; case SIOCGVH: /* XXX: lockless read */ bzero(&carpr, sizeof(carpr)); carpr.carpr_state = sc->sc_state; carpr.carpr_vhid = sc->sc_vhid; carpr.carpr_advbase = sc->sc_advbase; carpr.carpr_advskew = sc->sc_advskew; error = priv_check(curthread, PRIV_NETINET_CARP); if (error == 0) bcopy(sc->sc_key, carpr.carpr_key, sizeof(carpr.carpr_key)); error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); break; default: error = EINVAL; } if (locked) 
CARP_SCUNLOCK(sc); carp_hmac_prepare(sc); return (error); } /* * XXX: this is looutput. We should eventually use it from there. */ static int carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { u_int32_t af; M_ASSERTPKTHDR(m); /* check if we have the packet header */ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) { bcopy(dst->sa_data, &af, sizeof(af)); dst->sa_family = af; } #if 1 /* XXX */ switch (dst->sa_family) { case AF_INET: case AF_INET6: case AF_IPX: case AF_APPLETALK: break; default: printf("carp_looutput: af=%d unexpected\n", dst->sa_family); m_freem(m); return (EAFNOSUPPORT); } #endif return(if_simloop(ifp, m, dst->sa_family, 0)); } /* * Start output on carp interface. This function should never be called. 
*/ static void carp_start(struct ifnet *ifp) { #ifdef DEBUG printf("%s: start called\n", ifp->if_xname); #endif } int carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, struct rtentry *rt) { struct m_tag *mtag; struct carp_softc *sc; struct ifnet *carp_ifp; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif /* INET */ #ifdef INET6 case AF_INET6: break; #endif /* INET6 */ default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); sc = carp_ifp->if_softc; /* Set the source MAC address to Virtual Router MAC Address */ switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; case IFT_FDDI: { struct fddi_header *fh; fh = mtod(m, struct fddi_header *); fh->fddi_shost[0] = 0; fh->fddi_shost[1] = 0; fh->fddi_shost[2] = 0x5e; fh->fddi_shost[3] = 0; fh->fddi_shost[4] = 1; fh->fddi_shost[5] = sc->sc_vhid; } break; case IFT_ISO88025: { struct iso88025_header *th; th = mtod(m, struct iso88025_header *); th->iso88025_shost[0] = 3; th->iso88025_shost[1] = 0; th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1); th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); th->iso88025_shost[4] = 0; th->iso88025_shost[5] = 0; } break; default: printf("%s: carp is not supported for this interface type\n", ifp->if_xname); return (EOPNOTSUPP); } return (0); } static void carp_set_state(struct carp_softc *sc, int state) { if (sc->sc_carpdev) CARP_SCLOCK_ASSERT(sc); if (sc->sc_state == state) return; sc->sc_state = state; switch (state) { case BACKUP: SC2IFP(sc)->if_link_state = LINK_STATE_DOWN; break; case MASTER: SC2IFP(sc)->if_link_state = LINK_STATE_UP; break; default: SC2IFP(sc)->if_link_state = LINK_STATE_UNKNOWN; break; } 
rt_ifmsg(SC2IFP(sc)); } void carp_carpdev_state(void *v) { struct carp_if *cif = v; CARP_LOCK(cif); carp_carpdev_state_locked(cif); CARP_UNLOCK(cif); } static void carp_carpdev_state_locked(struct carp_if *cif) { struct carp_softc *sc; TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) carp_sc_state_locked(sc); } static void carp_sc_state_locked(struct carp_softc *sc) { CARP_SCLOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { sc->sc_flags_backup = SC2IFP(sc)->if_flags; SC2IFP(sc)->if_flags &= ~IFF_UP; SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->sc_ad_tmo); callout_stop(&sc->sc_md_tmo); callout_stop(&sc->sc_md6_tmo); carp_set_state(sc, INIT); carp_setrun(sc, 0); if (!sc->sc_suppress) { carp_suppress_preempt++; if (carp_suppress_preempt == 1) { CARP_SCUNLOCK(sc); carp_send_ad_all(); CARP_SCLOCK(sc); } } sc->sc_suppress = 1; } else { SC2IFP(sc)->if_flags |= sc->sc_flags_backup; carp_set_state(sc, INIT); carp_setrun(sc, 0); if (sc->sc_suppress) carp_suppress_preempt--; sc->sc_suppress = 0; } return; } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) return (ENOMEM); mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); LIST_INIT(&carpif_list); if_clone_attach(&carp_cloner); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if_clone_detach(&carp_cloner); mtx_destroy(&carp_mtx); break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/netinet/ip_dummynet.c =================================================================== --- head/sys/netinet/ip_dummynet.c (revision 171636) +++ head/sys/netinet/ip_dummynet.c (revision 171637) @@ -1,2206 +1,2206 @@ 
/*- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DUMMYNET_DEBUG #include "opt_inet6.h" /* * This module implements IP dummynet, a bandwidth limiter/delay emulator * used in conjunction with the ipfw package. * Description of the data structures used is in ip_dummynet.h * Here you mainly find the following blocks of code: * + variable declarations; * + heap management functions; * + scheduler and dummynet functions; * + configuration and initialization. * * NOTA BENE: critical sections are protected by the "dummynet lock". * * Most important Changes: * * 011004: KLDable * 010124: Fixed WF2Q behaviour * 010122: Fixed spl protection. 
* 000601: WF2Q support * 000106: large rewrite, use heaps to handle very many pipes. * 980513: initial release * * include files marked with XXX are probably not needed */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for struct arpcom */ #include /* for ip6_input, ip6_output prototypes */ #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) */ static dn_key curr_time = 0 ; /* current simulation time */ static int dn_hash_size = 64 ; /* default hash size */ /* statistics on number of queue searches and search steps */ static long searches, search_steps ; static int pipe_expire = 1 ; /* expire queue if empty */ static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ static struct timeval prev_t, t; static long tick_last; /* Last tick duration (usec). */ static long tick_delta; /* Last vs standard tick diff (usec). */ static long tick_delta_sum; /* Accumulated tick difference (usec).*/ static long tick_adjustment; /* Tick adjustments done. */ static long tick_lost; /* Lost(coalesced) ticks number. */ /* Adjusted vs non-adjusted curr_time difference (ticks). */ static long tick_diff; /* * Three heaps contain queues and pipes that the scheduler handles: * * ready_heap contains all dn_flow_queue related to fixed-rate pipes. * * wfq_ready_heap contains the pipes associated with WF2Q flows * * extract_heap contains pipes associated with delay lines. 
* */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; static int heap_init(struct dn_heap *h, int size); static int heap_insert (struct dn_heap *h, dn_key key1, void *p); static void heap_extract(struct dn_heap *h, void *obj); static void transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail); static void ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail); static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail); #define HASHSIZE 16 #define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */ static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */ static struct callout dn_timeout; extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time, CTLFLAG_RD, &curr_time, 0, "Current tick"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches, CTLFLAG_RD, &searches, 0, "Number of queue searches"); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps, CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, CTLFLAG_RW, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, &red_lookup_depth, 0, "Depth of 
RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, CTLFLAG_RD, &tick_diff, 0, "Adjusted vs non-adjusted curr_time difference (ticks)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, CTLFLAG_RD, &tick_lost, 0, "Number of ticks coalesced by dummynet taskqueue."); #endif #ifdef DUMMYNET_DEBUG int dummynet_debug = 0; #ifdef SYSCTL_NODE SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, 0, "control debugging printfs"); #endif #define DPRINTF(X) if (dummynet_debug) printf X #else #define DPRINTF(X) #endif static struct task dn_task; static struct taskqueue *dn_tq = NULL; static void dummynet_task(void *, int); static struct mtx dummynet_mtx; #define DUMMYNET_LOCK_INIT() \ mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF) #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx) #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx) #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx) #define DUMMYNET_LOCK_ASSERT() do { \ mtx_assert(&dummynet_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); static void dummynet(void *); static void dummynet_flush(void); static void dummynet_send(struct mbuf *); void dummynet_drain(void); static ip_dn_io_t dummynet_io; static void dn_rule_delete(void *); /* * Heap management 
functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * XXX failure to allocate a new element is a pretty bad failure * as we basically stall a whole queue forever!! * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( 2*(x) + 1 ) #define HEAP_IS_LEFT(x) ( (x) & 1 ) #define HEAP_RIGHT(x) ( 2*(x) + 2 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_init(struct dn_heap *h, int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) { printf("dummynet: %s, Bogus call, have %d want %d\n", __func__, h->size, new_size); return 0 ; } new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT); if (p == NULL) { printf("dummynet: %s, resize %d failed\n", __func__, new_size ); return 1 ; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DUMMYNET); } h->p = p ; h->size = new_size ; return 0 ; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) * * If offset > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; /* * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. 
*/ #define RESET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) { int son = h->elements ; if (p == NULL) /* data already there, set starting point */ son = key1 ; else { /* insert new element at the end, possibly resize */ son = h->elements ; if (son == h->size) /* need resize... */ if (heap_init(h, h->elements+1) ) return 1 ; /* failure... */ h->p[son].object = p ; h->p[son].key = key1 ; h->elements++ ; } while (son > 0) { /* bubble up */ int father = HEAP_FATHER(son) ; struct dn_heap_entry tmp ; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break ; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; SET_OFFSET(h, son); son = father ; } SET_OFFSET(h, son); return 0 ; } /* * remove top element from heap, or obj if obj != NULL */ static void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1 ; if (max < 0) { printf("dummynet: warning, extract from empty heap 0x%p\n", h); return ; } father = 0 ; /* default: move up smallest child */ if (obj != NULL) { /* extract specific element, index is at offset */ if (h->offset <= 0) panic("dummynet: heap_extract from middle not supported on this heap!!!\n"); father = *((int *)((char *)obj + h->offset)) ; if (father < 0 || father >= h->elements) { printf("dummynet: heap_extract, father %d out of bound 0..%d\n", father, h->elements); panic("dummynet: heap_extract"); } } RESET_OFFSET(h, father); child = HEAP_LEFT(father) ; /* left child */ while (child <= max) { /* valid entry */ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child = child+1 ; /* take right child, otherwise left */ h->p[father] = h->p[child] ; SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ } h->elements-- ; if (father != max) { /* * Fill hole with last entry and 
bubble up, reusing the insert code */ h->p[father] = h->p[max] ; heap_insert(h, father, NULL); /* this one cannot fail */ } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, dn_key new_key, void *object) { int temp; int i ; int max = h->elements-1 ; struct dn_heap_entry buf ; if (h->offset <= 0) panic("cannot move items on this heap"); i = *((int *)((char *)object + h->offset)); if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ h->p[i].key = new_key ; for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; i = temp ) { /* bubble up */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } } else { /* must move down */ h->p[i].key = new_key ; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) temp++ ; /* select child with min key */ if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } else break ; i = temp ; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. */ static void heapify(struct dn_heap *h) { int i ; for (i = 0 ; i < h->elements ; i++ ) heap_insert(h, i , NULL) ; } /* * cleanup the heap and free data structure */ static void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DUMMYNET); bzero(h, sizeof(*h) ); } /* * --- end of heap management functions --- */ /* * Return the mbuf tag holding the dummynet state. As an optimization * this is assumed to be the first tag on the list. If this turns out * wrong we'll need to search the list. 
*/ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } /* * Scheduler functions: * * transmit_event() is called when the delay-line needs to enter * the scheduler, either because of existing pkts getting ready, * or new packets entering the queue. The event handled is the delivery * time of the packet. * * ready_event() does something similar with fixed-rate queues, and the * event handled is the finish time of the head pkt. * * wfq_ready_event() does something similar with WF2Q queues, and the * event handled is the start time of the head pkt. * * In all cases, we make sure that the data structures are consistent * before passing pkts out, because this might trigger recursive * invocations of the procedures. */ static void transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) { struct mbuf *m; struct dn_pkt_tag *pkt; DUMMYNET_LOCK_ASSERT(); while ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); if (!DN_KEY_LEQ(pkt->output_time, curr_time)) break; pipe->head = m->m_nextpkt; if (*tail != NULL) (*tail)->m_nextpkt = m; else *head = m; *tail = m; } if (*tail != NULL) (*tail)->m_nextpkt = NULL; /* If there are leftover packets, put into the heap for next event. */ if ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); /* * XXX: Should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful. */ heap_insert(&extract_heap, pkt->output_time, pipe); } } /* * the following macro computes how many ticks we have to wait * before being able to transmit a packet. 
The credit is taken from * either a pipe (WF2Q) or a flow_queue (per-flow queueing) */ #define SET_TICKS(_m, q, p) \ ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ p->bandwidth ; /* * extract pkt from queue, compute output time (could be now) * and put into delay line (p_queue) */ static void move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p, int len) { struct dn_pkt_tag *dt = dn_tag_get(pkt); q->head = pkt->m_nextpkt ; q->len-- ; q->len_bytes -= len ; dt->output_time = curr_time + p->delay ; if (p->head == NULL) p->head = pkt; else p->tail->m_nextpkt = pkt; p->tail = pkt; p->tail->m_nextpkt = NULL; } /* * ready_event() is invoked every time the queue must enter the * scheduler, either because the first packet arrives, or because * a previously scheduled event fired. * On invokation, drain as many pkts as possible (could be 0) and then * if there are leftover packets reinsert the pkt in the scheduler. */ static void ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail) { struct mbuf *pkt; struct dn_pipe *p = q->fs->pipe ; int p_was_empty ; DUMMYNET_LOCK_ASSERT(); if (p == NULL) { printf("dummynet: ready_event- pipe is gone\n"); return ; } p_was_empty = (p->head == NULL) ; /* * schedule fixed-rate queues linked to this pipe: * Account for the bw accumulated since last scheduling, then * drain as many pkts as allowed by q->numbytes and move to * the delay line (in p) computing output time. * bandwidth==0 (no limit) means we can drain the whole queue, * setting len_scaled = 0 does the job. */ q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; while ( (pkt = q->head) != NULL ) { int len = pkt->m_pkthdr.len; int len_scaled = p->bandwidth ? 
len*8*hz : 0 ; if (len_scaled > q->numbytes ) break ; q->numbytes -= len_scaled ; move_pkt(pkt, q, p, len); } /* * If we have more packets queued, schedule next ready event * (can only occur when bandwidth != 0, otherwise we would have * flushed the whole queue in the previous loop). * To this purpose we record the current time and compute how many * ticks to go for the finish time of the packet. */ if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ q->sched_time = curr_time ; heap_insert(&ready_heap, curr_time + t, (void *)q ); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ } else { /* RED needs to know when the queue becomes empty */ q->q_time = curr_time; q->numbytes = 0; } /* * If the delay line was empty call transmit_event() now. * Otherwise, the scheduler will take care of it. */ if (p_was_empty) transmit_event(p, head, tail); } /* * Called when we can transmit packets on WF2Q queues. Take pkts out of * the queues at their start time, and enqueue into the delay line. * Packets are drained until p->numbytes < 0. As long as * len_scaled >= p->numbytes, the packet goes into the delay line * with a deadline p->delay. For the last packet, if p->numbytes<0, * there is an additional delay. 
*/
static void
ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    DUMMYNET_LOCK_ASSERT();

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name));
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct mbuf *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    /*
	     * The packet length is scaled up by MY_M bits before the
	     * division.  MY_M is defined outside this part of the file
	     * (presumably a fixed-point precision shift in ip_dummynet.h
	     * -- TODO confirm).
	     */
	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ;	/* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * If we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	/* ticks needed to bring numbytes back to zero, rounded up */
	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	dn_tag_get(p->tail)->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event() now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p, head, tail);
}

/*
 * This is called one tick, after previous run. It is used to
 * schedule next run.
 *
 * The handler only queues dn_task on the dn_tq taskqueue.
 */
static void
dummynet(void * __unused unused)
{

	taskqueue_enqueue(dn_tq, &dn_task);
}

/*
 * The main dummynet processing function.
*/ static void dummynet_task(void *context, int pending) { struct mbuf *head = NULL, *tail = NULL; struct dn_pipe *pipe; struct dn_heap *heaps[3]; struct dn_heap *h; void *p; /* generic parameter to handler */ int i; NET_LOCK_GIANT(); DUMMYNET_LOCK(); heaps[0] = &ready_heap; /* fixed-rate queues */ heaps[1] = &wfq_ready_heap; /* wfq queues */ heaps[2] = &extract_heap; /* delay line */ /* Update number of lost(coalesced) ticks. */ tick_lost += pending - 1; getmicrouptime(&t); /* Last tick duration (usec). */ tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 + (t.tv_usec - prev_t.tv_usec); /* Last tick vs standard tick difference (usec). */ tick_delta = (tick_last * hz - 1000000) / hz; /* Accumulated tick difference (usec). */ tick_delta_sum += tick_delta; prev_t = t; /* * Adjust curr_time if accumulated tick difference greater than * 'standard' tick. Since curr_time should be monotonically increasing, * we do positive adjustment as required and throttle curr_time in * case of negative adjustment. */ curr_time++; if (tick_delta_sum - tick >= 0) { int diff = tick_delta_sum / tick; curr_time += diff; tick_diff += diff; tick_delta_sum %= tick; tick_adjustment++; } else if (tick_delta_sum + tick <= 0) { curr_time--; tick_diff--; tick_delta_sum += tick; tick_adjustment++; } for (i = 0; i < 3; i++) { h = heaps[i]; while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) { if (h->p[0].key > curr_time) printf("dummynet: warning, " "heap %d is %d ticks late\n", i, (int)(curr_time - h->p[0].key)); /* store a copy before heap_extract */ p = h->p[0].object; /* need to extract before processing */ heap_extract(h, NULL); if (i == 0) ready_event(p, &head, &tail); else if (i == 1) { struct dn_pipe *pipe = p; if (pipe->if_name[0] != '\0') printf("dummynet: bad ready_event_wfq " "for pipe %s\n", pipe->if_name); else ready_event_wfq(p, &head, &tail); } else transmit_event(p, &head, &tail); } } /* Sweep pipes trying to expire idle flow_queues. 
*/ for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(pipe, &pipehash[i], next) if (pipe->idle_heap.elements > 0 && DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) { struct dn_flow_queue *q = pipe->idle_heap.p[0].object; heap_extract(&(pipe->idle_heap), NULL); /* Mark timestamp as invalid. */ q->S = q->F + 1; pipe->sum -= q->fs->weight; } DUMMYNET_UNLOCK(); if (head != NULL) dummynet_send(head); callout_reset(&dn_timeout, 1, dummynet, NULL); NET_UNLOCK_GIANT(); } static void dummynet_send(struct mbuf *m) { struct dn_pkt_tag *pkt; struct mbuf *n; struct ip *ip; for (; m != NULL; m = n) { n = m->m_nextpkt; m->m_nextpkt = NULL; pkt = dn_tag_get(m); switch (pkt->dn_dir) { case DN_TO_IP_OUT: ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); break ; case DN_TO_IP_IN : ip = mtod(m, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); netisr_dispatch(NETISR_IP, m); break; #ifdef INET6 case DN_TO_IP6_IN: netisr_dispatch(NETISR_IPV6, m); break; case DN_TO_IP6_OUT: ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); break; #endif case DN_TO_IFB_FWD: if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, pkt->ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DN_TO_ETH_DEMUX: /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. * Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup failed, " "dropping packet\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); break; case DN_TO_ETH_OUT: ether_output_frame(pkt->ifp, m); break; default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(m); break; } } } /* * Unconditionally expire empty queues in case of shortage. * Returns the number of queues freed. 
*/ static int expire_queues(struct dn_flow_set *fs) { struct dn_flow_queue *q, *prev ; int i, initial_elements = fs->rq_elements ; if (fs->last_expired == time_uptime) return 0 ; fs->last_expired = time_uptime ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) if (q->head != NULL || q->S != q->F+1) { prev = q ; q = q->next ; } else { /* entry is idle, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); } return initial_elements - fs->rq_elements ; } /* * If room, create a new queue and put at head of slot i; * otherwise, create or use the default queue. */ static struct dn_flow_queue * create_queue(struct dn_flow_set *fs, int i) { struct dn_flow_queue *q ; if (fs->rq_elements > fs->rq_size * dn_max_ratio && expire_queues(fs) == 0) { /* * No way to get room, use or create overflow queue. */ i = fs->rq_size ; if ( fs->rq[i] != NULL ) return fs->rq[i] ; } q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO); if (q == NULL) { printf("dummynet: sorry, cannot allocate queue for new flow\n"); return NULL ; } q->fs = fs ; q->hash_slot = i ; q->next = fs->rq[i] ; q->S = q->F + 1; /* hack - mark timestamp as invalid */ fs->rq[i] = q ; fs->rq_elements++ ; return q ; } /* * Given a flow_set and a pkt in last_pkt, find a matching queue * after appropriate masking. The queue is moved to front * so that further searches take less time. 
*/ static struct dn_flow_queue * find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; int is_v6 = IS_IP6_FLOW_ID(id); if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) q = fs->rq[0] ; else { /* first, do the masking, then hash */ id->dst_port &= fs->flow_mask.dst_port ; id->src_port &= fs->flow_mask.src_port ; id->proto &= fs->flow_mask.proto ; id->flags = 0 ; /* we don't care about this one */ if (is_v6) { APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6); APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6); id->flow_id6 &= fs->flow_mask.flow_id6; i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^ ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^ ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto ) ^ (id->flow_id6); } else { id->dst_ip &= fs->flow_mask.dst_ip ; id->src_ip &= fs->flow_mask.src_ip ; i = ( (id->dst_ip) & 0xffff ) ^ ( (id->dst_ip >> 15) & 0xffff ) ^ ( (id->src_ip << 1) & 0xffff ) ^ ( (id->src_ip >> 16 ) & 0xffff ) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto ); } i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; for (prev=NULL, q = fs->rq[i] ; q ; ) { 
search_steps++; if (is_v6 && IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) && IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) && id->dst_port == q->id.dst_port && id->src_port == q->id.src_port && id->proto == q->id.proto && id->flags == q->id.flags && id->flow_id6 == q->id.flow_id6) break ; /* found */ if (!is_v6 && id->dst_ip == q->id.dst_ip && id->src_ip == q->id.src_ip && id->dst_port == q->id.dst_port && id->src_port == q->id.src_port && id->proto == q->id.proto && id->flags == q->id.flags) break ; /* found */ /* No match. Check if we can expire the entry */ if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { /* entry is idle and not in any heap, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); continue ; } prev = q ; q = q->next ; } if (q && prev != NULL) { /* found and not in front */ prev->next = q->next ; q->next = fs->rq[i] ; fs->rq[i] = q ; } } if (q == NULL) { /* no match, need to allocate a new entry */ q = create_queue(fs, i); if (q != NULL) q->id = *id ; } return q ; } static int red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. */ int64_t p_b = 0; /* Queue in bytes or packets? */ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? 
q->len_bytes : q->len; DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size)); /* Average queue size estimation. */ if (q_size != 0) { /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); q->avg += (int)v; } else { /* * Queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = (curr_time - q->q_time) / fs->lookup_step; q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); /* Should i drop? */ if (q->avg < fs->min_th) { q->count = -1; return (0); /* accept packet */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->flags_fs & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than * max_th the packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th * c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4; } else { q->count = -1; DPRINTF(("dummynet: - drop")); return (1); } } else if (q->avg > fs->min_th) { /* * We compute p_b using the linear dropping function * p_b = c_1 * avg - c_2 * where c_1 = max_p / (max_th - min_th) * c_2 = max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; } if (fs->flags_fs & DN_QSIZE_IS_BYTES) p_b = (p_b * len) / fs->max_pkt_size; if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { q->count = 0; DPRINTF(("dummynet: - red drop")); /* After a drop we calculate a new random value. 
*/ q->random = random() & 0xffff; return (1); /* drop */ } } /* End of RED algorithm. */ return (0); /* accept */ } static __inline struct dn_flow_set * locate_flowset(int fs_nr) { struct dn_flow_set *fs; SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) if (fs->fs_nr == fs_nr) return (fs); return (NULL); } static __inline struct dn_pipe * locate_pipe(int pipe_nr) { struct dn_pipe *pipe; SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next) if (pipe->pipe_nr == pipe_nr) return (pipe); return (NULL); } /* * dummynet hook for packets. Below 'pipe' is a pipe or a queue * depending on whether WF2Q or fixed bw is used. * * pipe_nr pipe or queue the packet is destined for. * dir where shall we send the packet after dummynet. * m the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, * rule matching rule, in case of multiple passes * */ static int dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct mbuf *head = NULL, *tail = NULL; struct dn_pkt_tag *pkt; struct m_tag *mtag; struct dn_flow_set *fs = NULL; struct dn_pipe *pipe ; u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; int is_pipe; ipfw_insn *cmd = ACTION_PTR(fwa->rule); KASSERT(m->m_nextpkt == NULL, ("dummynet_io: mbuf queue passed to dummynet")); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); if (cmd->opcode == O_ALTQ) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); is_pipe = (cmd->opcode == O_PIPE); DUMMYNET_LOCK(); /* * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. * * XXXGL: probably the pipe->fs and fs->pipe logic here * below can be simplified. */ if (is_pipe) { pipe = locate_pipe(fwa->cookie); if (pipe != NULL) fs = &(pipe->fs); } else fs = locate_flowset(fwa->cookie); if (fs == NULL) goto dropit; /* This queue/pipe does not exist! */ pipe = fs->pipe; if (pipe == NULL) { /* Must be a queue, try find a matching pipe. 
*/ pipe = locate_pipe(fs->parent_nr); if (pipe != NULL) fs->pipe = pipe; else { printf("dummynet: no pipe %d for queue %d, drop pkt\n", fs->parent_nr, fs->fs_nr); goto dropit ; } } q = find_queue(fs, &(fwa->f_id)); if ( q == NULL ) goto dropit ; /* cannot allocate queue */ /* * update statistics, then check reasons to drop pkt */ q->tot_bytes += len ; q->tot_pkts++ ; if ( fs->plr && random() < fs->plr ) goto dropit ; /* random pkt drop */ if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { if (q->len_bytes > fs->qsize) goto dropit ; /* queue size overflow */ } else { if (q->len >= fs->qsize) goto dropit ; /* queue count overflow */ } if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) goto dropit ; /* XXX expensive to zero, see if we can remove it*/ mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO); if ( mtag == NULL ) goto dropit ; /* cannot allocate packet header */ m_tag_prepend(m, mtag); /* attach to mbuf chain */ pkt = (struct dn_pkt_tag *)(mtag+1); /* ok, i can handle the pkt now... */ /* build and enqueue packet + parameters */ pkt->rule = fwa->rule ; pkt->dn_dir = dir ; pkt->ifp = fwa->oif; if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->tail = m; q->len++; q->len_bytes += len ; if ( q->head != m ) /* flow was not idle, we are done */ goto done; /* * If we reach this point the flow was previously idle, so we need * to schedule it. This involves different actions for fixed-rate or * WF2Q queues. */ if (is_pipe) { /* * Fixed-rate queue: just insert into the ready_heap. */ dn_key t = 0 ; if (pipe->bandwidth) t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; if (t == 0) /* must process it now */ ready_event(q, &head, &tail); else heap_insert(&ready_heap, curr_time + t , q ); } else { /* * WF2Q. First, compute start time S: if the flow was idle (S=F+1) * set S to the virtual time V for the controlling pipe, and update * the sum of weights for the pipe; otherwise, remove flow from * idle_heap and set S to max(F,V). 
* Second, compute finish time F = S + len/weight. * Third, if pipe was idle, update V=max(S, V). * Fourth, count one more backlogged flow. */ if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */ q->S = pipe->V ; pipe->sum += fs->weight ; /* add weight of new queue */ } else { heap_extract(&(pipe->idle_heap), q); q->S = MAX64(q->F, pipe->V ) ; } q->F = q->S + ( len<weight; if (pipe->not_eligible_heap.elements == 0 && pipe->scheduler_heap.elements == 0) pipe->V = MAX64 ( q->S, pipe->V ); fs->backlogged++ ; /* * Look at eligibility. A flow is not eligibile if S>V (when * this happens, it means that there is some other flow already * scheduled for the same pipe, so the scheduler_heap cannot be * empty). If the flow is not eligible we just store it in the * not_eligible_heap. Otherwise, we store in the scheduler_heap * and possibly invoke ready_event_wfq() right now if there is * leftover credit. * Note that for all flows in scheduler_heap (SCH), S_i <= V, * and for all flows in not_eligible_heap (NEH), S_i > V . * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, * we only need to look into NEH. */ if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ if (pipe->scheduler_heap.elements == 0) printf("dummynet: ++ ouch! not eligible but empty scheduler!\n"); heap_insert(&(pipe->not_eligible_heap), q->S, q); } else { heap_insert(&(pipe->scheduler_heap), q->F, q); if (pipe->numbytes >= 0) { /* pipe is idle */ if (pipe->scheduler_heap.elements != 1) printf("dummynet: OUCH! pipe should have been idle!\n"); DPRINTF(("dummynet: waking up pipe %d at %d\n", pipe->pipe_nr, (int)(q->F >> MY_M))); pipe->sched_time = curr_time ; ready_event_wfq(pipe, &head, &tail); } } } done: DUMMYNET_UNLOCK(); if (head != NULL) dummynet_send(head); return 0; dropit: if (q) q->drops++ ; DUMMYNET_UNLOCK(); m_freem(m); return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 
0 : ENOBUFS); } /* * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) * Doing this would probably save us the initial bzero of dn_pkt */ #define DN_FREE_PKT(_m) do { \ m_freem(_m); \ } while (0) /* * Dispose all packets and flow_queues on a flow_set. * If all=1, also remove red lookup table and other storage, * including the descriptor itself. * For the one in dn_pipe MUST also cleanup ready_heap... */ static void purge_flow_set(struct dn_flow_set *fs, int all) { struct dn_flow_queue *q, *qn; int i; DUMMYNET_LOCK_ASSERT(); for (i = 0; i <= fs->rq_size; i++) { for (q = fs->rq[i]; q != NULL; q = qn) { struct mbuf *m, *mnext; mnext = q->head; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; DN_FREE_PKT(m); } qn = q->next; free(q, M_DUMMYNET); } fs->rq[i] = NULL; } fs->rq_elements = 0; if (all) { /* RED - free lookup table. */ if (fs->w_q_lookup != NULL) free(fs->w_q_lookup, M_DUMMYNET); if (fs->rq != NULL) free(fs->rq, M_DUMMYNET); /* If this fs is not part of a pipe, free it. */ if (fs->pipe == NULL || fs != &(fs->pipe->fs)) free(fs, M_DUMMYNET); } } /* * Dispose all packets queued on a pipe (not a flow_set). * Also free all resources associated to a pipe, which is about * to be deleted. */ static void purge_pipe(struct dn_pipe *pipe) { struct mbuf *m, *mnext; purge_flow_set( &(pipe->fs), 1 ); mnext = pipe->head; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; DN_FREE_PKT(m); } heap_free( &(pipe->scheduler_heap) ); heap_free( &(pipe->not_eligible_heap) ); heap_free( &(pipe->idle_heap) ); } /* * Delete all pipes and heaps returning memory. Must also * remove references from all ipfw rules to all pipes. */ static void dummynet_flush(void) { struct dn_pipe *pipe, *pipe1; struct dn_flow_set *fs, *fs1; int i; DUMMYNET_LOCK(); /* Free heaps so we don't have unwanted events. */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); /* * Now purge all queued pkts and delete all pipes. 
* * XXXGL: can we merge the for(;;) cycles into one or not? */ for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) { SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next); purge_flow_set(fs, 1); } for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) { SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next); purge_pipe(pipe); free(pipe, M_DUMMYNET); } DUMMYNET_UNLOCK(); } extern struct ip_fw *ip_fw_default_rule ; static void dn_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; struct mbuf *m ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ for (q = fs->rq[i] ; q ; q = q->next ) for (m = q->head ; m ; m = m->m_nextpkt ) { struct dn_pkt_tag *pkt = dn_tag_get(m) ; if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } } /* * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. */ void dn_rule_delete(void *r) { struct dn_pipe *pipe; struct dn_flow_set *fs; struct dn_pkt_tag *pkt; struct mbuf *m; int i; DUMMYNET_LOCK(); /* * If the rule references a queue (dn_flow_set), then scan * the flow set, otherwise scan pipes. Should do either, but doing * both does not harm. 
*/ for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(fs, &flowsethash[i], next) dn_rule_delete_fs(fs, r); for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(pipe, &pipehash[i], next) { fs = &(pipe->fs); dn_rule_delete_fs(fs, r); for (m = pipe->head ; m ; m = m->m_nextpkt ) { pkt = dn_tag_get(m); if (pkt->rule == r) pkt->rule = ip_fw_default_rule; } } DUMMYNET_UNLOCK(); } /* * setup RED parameters */ static int config_red(struct dn_flow_set *p, struct dn_flow_set *x) { int i; x->w_q = p->w_q; x->min_th = SCALE(p->min_th); x->max_th = SCALE(p->max_th); x->max_p = p->max_p; x->c_1 = p->max_p / (p->max_th - p->min_th); x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); if (x->flags_fs & DN_IS_GENTLE_RED) { x->c_3 = (SCALE(1) - p->max_p) / p->max_th; x->c_4 = SCALE(1) - 2 * p->max_p; } /* If the lookup table already exist, free and create it again. */ if (x->w_q_lookup) { free(x->w_q_lookup, M_DUMMYNET); x->w_q_lookup = NULL; } if (red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" "must be > 0\n"); free(x, M_DUMMYNET); return (EINVAL); } x->lookup_depth = red_lookup_depth; x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); if (x->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); free(x, M_DUMMYNET); return(ENOSPC); } /* Fill the lookup table with (1 - w_q)^x */ x->lookup_step = p->lookup_step; x->lookup_weight = p->lookup_weight; x->w_q_lookup[0] = SCALE(1) - x->w_q; for (i = 1; i < x->lookup_depth; i++) x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); if (red_avg_pkt_size < 1) red_avg_pkt_size = 512; x->avg_pkt_size = red_avg_pkt_size; if (red_max_pkt_size < 1) red_max_pkt_size = 1500; x->max_pkt_size = red_max_pkt_size; return (0); } static int alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) { if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ int l = pfs->rq_size; if (l == 0) l = dn_hash_size; if (l < 4) l = 4; else if (l > 
DN_MAX_HASH_SIZE) l = DN_MAX_HASH_SIZE; x->rq_size = l; } else /* one is enough for null mask */ x->rq_size = 1; x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), M_DUMMYNET, M_NOWAIT | M_ZERO); if (x->rq == NULL) { printf("dummynet: sorry, cannot allocate queue\n"); return (ENOMEM); } x->rq_elements = 0; return 0 ; } static void set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) { x->flags_fs = src->flags_fs; x->qsize = src->qsize; x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { if (x->qsize > 1024 * 1024) x->qsize = 1024 * 1024; } else { if (x->qsize == 0) x->qsize = 50; if (x->qsize > 100) x->qsize = 50; } /* Configuring RED. */ if (x->flags_fs & DN_IS_RED) config_red(src, x); /* XXX should check errors */ } /* * Setup pipe or queue parameters. */ static int config_pipe(struct dn_pipe *p) { struct dn_flow_set *pfs = &(p->fs); struct dn_flow_queue *q; int i, error; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes */ p->delay = (p->delay * hz) / 1000; /* We need either a pipe number or a flow_set number. */ if (p->pipe_nr == 0 && pfs->fs_nr == 0) return (EINVAL); if (p->pipe_nr != 0 && pfs->fs_nr != 0) return (EINVAL); if (p->pipe_nr != 0) { /* this is a pipe */ struct dn_pipe *pipe; DUMMYNET_LOCK(); pipe = locate_pipe(p->pipe_nr); /* locate pipe */ if (pipe == NULL) { /* new pipe */ pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_NOWAIT | M_ZERO); if (pipe == NULL) { DUMMYNET_UNLOCK(); printf("dummynet: no memory for new pipe\n"); return (ENOMEM); } pipe->pipe_nr = p->pipe_nr; pipe->fs.pipe = pipe; /* * idle_heap is the only one from which * we extract from the middle. */ pipe->idle_heap.size = pipe->idle_heap.elements = 0; pipe->idle_heap.offset = offsetof(struct dn_flow_queue, heap_pos); } else /* Flush accumulated credit for all queues. 
*/ for (i = 0; i <= pipe->fs.rq_size; i++) for (q = pipe->fs.rq[i]; q; q = q->next) q->numbytes = 0; pipe->bandwidth = p->bandwidth; pipe->numbytes = 0; /* just in case... */ bcopy(p->if_name, pipe->if_name, sizeof(p->if_name)); pipe->ifp = NULL; /* reset interface ptr */ pipe->delay = p->delay; set_fs_parms(&(pipe->fs), pfs); if (pipe->fs.rq == NULL) { /* a new pipe */ error = alloc_hash(&(pipe->fs), pfs); if (error) { DUMMYNET_UNLOCK(); free(pipe, M_DUMMYNET); return (error); } SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)], pipe, next); } DUMMYNET_UNLOCK(); } else { /* config queue */ struct dn_flow_set *fs; DUMMYNET_LOCK(); fs = locate_flowset(pfs->fs_nr); /* locate flow_set */ if (fs == NULL) { /* new */ if (pfs->parent_nr == 0) { /* need link to a pipe */ DUMMYNET_UNLOCK(); return (EINVAL); } fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET, M_NOWAIT | M_ZERO); if (fs == NULL) { DUMMYNET_UNLOCK(); printf( "dummynet: no memory for new flow_set\n"); return (ENOMEM); } fs->fs_nr = pfs->fs_nr; fs->parent_nr = pfs->parent_nr; fs->weight = pfs->weight; if (fs->weight == 0) fs->weight = 1; else if (fs->weight > 100) fs->weight = 100; } else { /* * Change parent pipe not allowed; * must delete and recreate. */ if (pfs->parent_nr != 0 && fs->parent_nr != pfs->parent_nr) { DUMMYNET_UNLOCK(); return (EINVAL); } } set_fs_parms(fs, pfs); if (fs->rq == NULL) { /* a new flow_set */ error = alloc_hash(fs, pfs); if (error) { DUMMYNET_UNLOCK(); free(fs, M_DUMMYNET); return (error); } SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)], fs, next); } DUMMYNET_UNLOCK(); } return (0); } /* * Helper function to remove from a heap queues which are linked to * a flow_set about to be deleted. 
*/
/*
 * Every entry of 'h' is treated as a struct dn_flow_queue; those whose
 * ->fs equals 'fs' are overwritten with the last entry and the heap is
 * shortened.  The heap order is restored once at the end, and only if
 * something was actually removed.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
	int i = 0, found = 0 ;

	/* i only advances when slot i was kept: a moved-in entry is rechecked */
	for (; i < h->elements ;)
		if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
			h->elements-- ;
			h->p[i] = h->p[h->elements] ;
			found++ ;
		} else
			i++ ;
	if (found)
		heapify(h);
}

/*
 * helper function to remove a pipe from a heap (can be there at most once)
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
	if (h->elements > 0) {
		int i = 0 ;

		for (i=0; i < h->elements ; i++ ) {
			if (h->p[i].object == p) { /* found it */
				/* replace with the last entry, then re-heapify */
				h->elements-- ;
				h->p[i] = h->p[h->elements] ;
				heapify(h);
				break ;
			}
		}
	}
}

/*
 * drain all queues. Called in case of severe mbuf shortage.
 *
 * The caller must hold the dummynet lock (asserted below).  Pipes and
 * flow sets themselves are kept; only the global event heaps, the flow
 * queues with their packets, and the delay lines are released.
 */
void
dummynet_drain(void)
{
	struct dn_flow_set *fs;
	struct dn_pipe *pipe;
	struct mbuf *m, *mnext;
	int i;

	DUMMYNET_LOCK_ASSERT();

	heap_free(&ready_heap);
	heap_free(&wfq_ready_heap);
	heap_free(&extract_heap);
	/* purge the queues (and queued packets) of every flow set */
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(fs, &flowsethash[i], next)
			purge_flow_set(fs, 0);

	for (i = 0; i < HASHSIZE; i++) {
		SLIST_FOREACH(pipe, &pipehash[i], next) {
			purge_flow_set(&(pipe->fs), 0);

			/* free the packets waiting in the delay line */
			mnext = pipe->head;
			while ((m = mnext) != NULL) {
				mnext = m->m_nextpkt;
				DN_FREE_PKT(m);
			}
			pipe->head = pipe->tail = NULL;
		}
	}
}

/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 *
 * Exactly one of p->pipe_nr and p->fs.fs_nr must be non-zero and selects
 * what is deleted.  Returns 0, EINVAL for an inconsistent request, or
 * ENOENT when the pipe / flow set does not exist.
 */
static int
delete_pipe(struct dn_pipe *p)
{
	if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
		return EINVAL ;
	if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
		return EINVAL ;
	if (p->pipe_nr != 0) { /* this is an old-style pipe */
		struct dn_pipe *pipe;
		struct dn_flow_set *fs;
		int i;

		DUMMYNET_LOCK();
		pipe = locate_pipe(p->pipe_nr);	/* locate pipe */

		if (pipe == NULL) {
			DUMMYNET_UNLOCK();
			return (ENOENT);	/* not found */
		}

		/* Unlink from list of pipes. */
		SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);

		/* Remove all references to this pipe from flow_sets. */
		for (i = 0; i < HASHSIZE; i++)
			SLIST_FOREACH(fs, &flowsethash[i], next)
				if (fs->pipe == pipe) {
					printf("dummynet: ++ ref to pipe %d from fs %d\n",
					    p->pipe_nr, fs->fs_nr);
					fs->pipe = NULL ;
					purge_flow_set(fs, 0);
				}
		/* drop the pipe's own queues from ready_heap before freeing them */
		fs_remove_from_heap(&ready_heap, &(pipe->fs));
		purge_pipe(pipe); /* remove all data associated to this pipe */
		/* remove reference to here from extract_heap and wfq_ready_heap */
		pipe_remove_from_heap(&extract_heap, pipe);
		pipe_remove_from_heap(&wfq_ready_heap, pipe);
		DUMMYNET_UNLOCK();

		free(pipe, M_DUMMYNET);
	} else { /* this is a WF2Q queue (dn_flow_set) */
		struct dn_flow_set *fs;

		DUMMYNET_LOCK();
		fs = locate_flowset(p->fs.fs_nr); /* locate set */

		if (fs == NULL) {
			DUMMYNET_UNLOCK();
			return (ENOENT); /* not found */
		}

		/* Unlink from list of flowsets. */
		SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);

		if (fs->pipe != NULL) {
			/* Update total weight on parent pipe and cleanup parent heaps. */
			fs->pipe->sum -= fs->weight * fs->backlogged ;
			fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
			fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
#if 1	/* XXX should i remove from idle_heap as well ? */
			fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
#endif
		}
		/* frees the queues and, via all=1, the flow set descriptor */
		purge_flow_set(fs, 1);
		DUMMYNET_UNLOCK();
	}
	return 0 ;
}

/*
 * helper function used to copy data from kernel in DUMMYNET_GET
 *
 * Copies every flow queue of 'set' to consecutive struct dn_flow_queue
 * slots starting at 'bp', clearing the pointer fields of each copy, and
 * returns the address just past the last copy.  Inconsistencies found
 * on the way (wrong slot, wrong fs pointer, wrong element count) are
 * only reported with printf.  The dummynet lock must be held.
 */
static char *
dn_copy_set(struct dn_flow_set *set, char *bp)
{
	int i, copied = 0 ;
	struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;

	DUMMYNET_LOCK_ASSERT();

	for (i = 0 ; i <= set->rq_size ; i++)
		for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
			if (q->hash_slot != i)
				printf("dummynet: ++ at %d: wrong slot (have %d, "
				    "should be %d)\n", copied, q->hash_slot, i);
			if (q->fs != set)
				printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
				    i, q->fs, set);
			copied++ ;
			bcopy(q, qp, sizeof( *q ) );
			/* cleanup pointers */
			qp->next = NULL ;
			qp->head = qp->tail = NULL ;
			qp->fs = NULL ;
		}
	if (copied != set->rq_elements)
		printf("dummynet: ++ wrong count, have %d should be %d\n",
		    copied, set->rq_elements);
	return (char *)qp ;
}

/*
 * Return the number of bytes dummynet_get() needs: one descriptor per
 * pipe and per flow set, each followed by rq_elements flow queue
 * records.  The dummynet lock must be held.
 */
static size_t
dn_calc_size(void)
{
	struct dn_flow_set *fs;
	struct dn_pipe *pipe;
	size_t size = 0;
	int i;

	DUMMYNET_LOCK_ASSERT();
	/*
	 * Compute size of data structures: list of pipes and flow_sets.
	 */
	for (i = 0; i < HASHSIZE; i++) {
		SLIST_FOREACH(pipe, &pipehash[i], next)
			size += sizeof(*pipe) +
			    pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
		SLIST_FOREACH(fs, &flowsethash[i], next)
			size += sizeof (*fs) +
			    fs->rq_elements * sizeof(struct dn_flow_queue);
	}
	return size;
}

/*
 * Build a snapshot of all pipes, flow sets and their flow queues in a
 * temporary buffer and copy it out through 'sopt'.
 * Returns the sooptcopyout() result, or ENOBUFS when no buffer of a
 * stable size could be obtained.
 */
static int
dummynet_get(struct sockopt *sopt)
{
	char *buf, *bp ; /* bp is the "copy-pointer" */
	size_t size ;
	struct dn_flow_set *fs;
	struct dn_pipe *pipe;
	int error=0, i ;

	/* XXX lock held too long */
	DUMMYNET_LOCK();
	/*
	 * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
	 *      cannot use this flag while holding a mutex.
	 *
	 * The lock is dropped around malloc(); if the required size changed
	 * in the meantime the buffer is discarded and the attempt repeated,
	 * up to 10 times.
	 */
	for (i = 0; i < 10; i++) {
		size = dn_calc_size();
		DUMMYNET_UNLOCK();
		buf = malloc(size, M_TEMP, M_WAITOK);
		DUMMYNET_LOCK();
		if (size == dn_calc_size())
			break;
		free(buf, M_TEMP);
		buf = NULL;
	}
	if (buf == NULL) {
		DUMMYNET_UNLOCK();
		return ENOBUFS ;
	}
	bp = buf;
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(pipe, &pipehash[i], next) {
			struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;

			/*
			 * Copy pipe descriptor into *bp, convert delay back to ms,
			 * then copy the flow_set descriptor(s) one at a time.
			 * After each flow_set, copy the queue descriptor it owns.
			 */
			bcopy(pipe, bp, sizeof(*pipe));
			pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
			/*
			 * XXX the following is a hack based on ->next being the
			 * first field in dn_pipe and dn_flow_set. The correct
			 * solution would be to move the dn_flow_set to the beginning
			 * of struct dn_pipe.
			 */
			pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
			/* Clean pointers. */
			pipe_bp->head = pipe_bp->tail = NULL;
			pipe_bp->fs.next.sle_next = NULL;
			pipe_bp->fs.pipe = NULL;
			pipe_bp->fs.rq = NULL;

			bp += sizeof(*pipe) ;
			bp = dn_copy_set(&(pipe->fs), bp);
		}

	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(fs, &flowsethash[i], next) {
			struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;

			bcopy(fs, bp, sizeof(*fs));
			/* XXX same hack as above */
			fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
			fs_bp->pipe = NULL;
			fs_bp->rq = NULL;
			bp += sizeof(*fs);
			bp = dn_copy_set(fs, bp);
		}

	DUMMYNET_UNLOCK();

	/* copy out only after the lock has been released */
	error = sooptcopyout(sopt, buf, size);
	free(buf, M_TEMP);
	return error ;
}

/*
 * Handler for the various dummynet socket options (get, flush, config, del)
 *
 * Returns 0 or an errno value; an unknown option name yields EINVAL.
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
	int error = 0 ;
	struct dn_pipe *p, tmp_pipe;

	/* Disallow sets in really-really secure mode. */
	if (sopt->sopt_dir == SOPT_SET) {
#if __FreeBSD_version >= 500034
		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
		if (error)
			return (error);
#else
		if (securelevel >= 3)
			return (EPERM);
#endif
	}

	switch (sopt->sopt_name) {
	default :
		printf("dummynet: -- unknown option %d", sopt->sopt_name);
		return EINVAL ;

	case IP_DUMMYNET_GET :
		error = dummynet_get(sopt);
		break ;

	case IP_DUMMYNET_FLUSH :
		dummynet_flush() ;
		break ;

	case IP_DUMMYNET_CONFIGURE :
		/* the request is copied into a stack-local struct dn_pipe */
		p = &tmp_pipe ;
		error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
		if (error)
			break ;
		error = config_pipe(p);
		break ;

	case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
		p = &tmp_pipe ;
		error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
		if (error)
			break ;

		error = delete_pipe(p);
		break ;
	}
	return error ;
}

static void
ip_dn_init(void)
{
	int i;

	if (bootverbose)
		printf("DUMMYNET with IPv6 initialized (040826)\n");

	DUMMYNET_LOCK_INIT();

	for (i = 0; i < HASHSIZE; i++) {
		SLIST_INIT(&pipehash[i]);
		SLIST_INIT(&flowsethash[i]);
	}
	ready_heap.size = ready_heap.elements = 0;
	ready_heap.offset = 0;

	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
	wfq_ready_heap.offset = 0;

	extract_heap.size = extract_heap.elements = 0;
	extract_heap.offset = 0;

	ip_dn_ctl_ptr = ip_dn_ctl;
	ip_dn_io_ptr = dummynet_io;
	ip_dn_ruledel_ptr = dn_rule_delete;

	TASK_INIT(&dn_task, 0, dummynet_task, NULL);
	dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
	    taskqueue_thread_enqueue, &dn_tq);
	taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");

-	callout_init(&dn_timeout, NET_CALLOUT_MPSAFE);
+	callout_init(&dn_timeout, CALLOUT_MPSAFE);
	callout_reset(&dn_timeout, 1, dummynet, NULL);

	/* Initialize curr_time adjustment mechanics.
*/ getmicrouptime(&prev_t); } #ifdef KLD_MODULE static void ip_dn_destroy(void) { ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; ip_dn_ruledel_ptr = NULL; DUMMYNET_LOCK(); callout_stop(&dn_timeout); DUMMYNET_UNLOCK(); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); dummynet_flush(); DUMMYNET_LOCK_DESTROY(); } #endif /* KLD_MODULE */ static int dummynet_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if (DUMMYNET_LOADED) { printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else ip_dn_destroy(); #endif break ; default: return EOPNOTSUPP; break ; } return 0 ; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(dummynet, 1); Index: head/sys/netinet/ip_fw2.c =================================================================== --- head/sys/netinet/ip_fw2.c (revision 171636) +++ head/sys/netinet/ip_fw2.c (revision 171637) @@ -1,5056 +1,5056 @@ /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPFIREWALL_NAT #include #include #endif #include #include #include #include #ifdef INET6 #include #endif #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ #include /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. 
*/
static u_int32_t set_disable;

/* Logging knobs; both are exported through the sysctl declarations below. */
static int fw_verbose;
static int verbose_limit;

static struct callout ipfw_timeout;
static uma_zone_t ipfw_dyn_rule_zone;
#define	IPFW_DEFAULT_RULE	65535

/*
 * Data structure to cache our ucred related
 * information. This structure only gets used if
 * the user specified UID/GID based constraints in
 * a firewall rule.
 */
struct ip_fw_ugid {
	gid_t		fw_groups[NGROUPS];
	int		fw_ngroups;
	uid_t		fw_uid;
	int		fw_prid;
};

#define	IPFW_TABLES_MAX		128
/*
 * A rule chain: the static rules, the rules waiting to be reaped, the
 * NAT entries, up to IPFW_TABLES_MAX radix tables, and the rwlock that
 * the IPFW_*LOCK macros below operate on.
 */
struct ip_fw_chain {
	struct ip_fw	*rules;		/* list of rules */
	struct ip_fw	*reap;		/* list of rules to reap */
	LIST_HEAD(, cfg_nat) nat;       /* list of nat entries */
	struct radix_node_head *tables[IPFW_TABLES_MAX];
	struct rwlock	rwmtx;
};

/*
 * Locking helpers for a struct ip_fw_chain. Each one is a thin wrapper
 * around the corresponding rw_*() primitive applied to the chain's rwmtx.
 * IPFW_WLOCK_ASSERT additionally invokes NET_ASSERT_GIANT().
 */
#define	IPFW_LOCK_INIT(_chain) \
	rw_init(&(_chain)->rwmtx, "IPFW static rules")
#define	IPFW_LOCK_DESTROY(_chain)	rw_destroy(&(_chain)->rwmtx)
#define	IPFW_WLOCK_ASSERT(_chain)	do {				\
	rw_assert(&(_chain)->rwmtx, RA_WLOCKED);			\
	NET_ASSERT_GIANT();						\
} while (0)

#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)

/*
 * list of rules for layer 3
 */
static struct ip_fw_chain layer3_chain;

MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");

/*
 * Radix-tree node pair plus an IPv4 address/mask and a 32-bit value.
 * NOTE(review): presumably the element type stored in
 * ip_fw_chain.tables[] -- confirm against the table add/lookup code.
 */
struct table_entry {
	struct radix_node	rn[2];
	struct sockaddr_in	addr, mask;
	u_int32_t		value;
};

static int fw_debug = 1;
static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */

extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);

#ifdef SYSCTL_NODE
/* Sysctl node "fw" under _net_inet_ip and the knobs attached to it. */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0,
    ipfw_chg_hook, "I", "Enable ipfw");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
    &autoinc_step, 0, "Rule number autincrement step");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
    CTLFLAG_RW | CTLFLAG_SECURE3,
    &fw_one_pass, 0,
    "Only do a single pass through ipfw when using dummynet(4)");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
    &fw_debug, 0, "Enable printing of debug ip_fw statements");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
    CTLFLAG_RW | CTLFLAG_SECURE3,
    &fw_verbose, 0, "Log matches to ipfw rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");

/*
 * Description of dynamic rules.
 *
 * Dynamic rules are stored in lists accessed through a hash table
 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
 * be modified through the sysctl variable dyn_buckets which is
 * updated when the table becomes empty.
 *
 * XXX currently there is only one list, ipfw_dyn.
 *
 * When a packet is received, its address fields are first masked
 * with the mask defined for the rule, then hashed, then matched
 * against the entries in the corresponding list.
 * Dynamic rules can be used for different purposes:
 *  + stateful rules;
 *  + enforcing limits on the number of sessions;
 *  + in-kernel NAT (not implemented yet)
 *
 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * The total number of dynamic rules is stored in dyn_count.
 * The max number of dynamic rules is dyn_max. When we reach
 * the maximum number of rules we do not create anymore. This is
 * done to avoid consuming too much memory, but also too much
 * time when searching on each packet (ideally, we should try instead
 * to put a limit on the length of the list on each bucket...).
 *
 * Each dynamic rule holds a pointer to the parent ipfw rule so
 * we know what action to perform. Dynamic rules are removed when
 * the parent rule is deleted. XXX we should make them survive.
 *
 * There are some limitations with dynamic rules -- we do not
 * obey the 'randomized match', and we do not do multiple
 * passes through the firewall. XXX check the latter!!!
*/ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) /* * Timeouts for various events in handing dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static u_int32_t dyn_keepalive_interval = 20; static u_int32_t dyn_keepalive_period = 5; static u_int32_t dyn_keepalive = 1; /* do send keepalives */ static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &dyn_buckets, 0, "Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &dyn_count, 0, "Number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &dyn_max, 0, "Max number of dyn. 
rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &static_count, 0, "Number of static rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); #ifdef INET6 /* * IPv6 specific variables */ SYSCTL_DECL(_net_inet6_ip6); static struct sysctl_ctx_list ip6_fw_sysctl_ctx; static struct sysctl_oid *ip6_fw_sysctl_tree; #endif /* INET6 */ #endif /* SYSCTL_NODE */ #ifdef IPFIREWALL_NAT MODULE_DEPEND(ipfw, libalias, 1, 1, 1); #endif static int fw_deny_unknown_exthdrs = 1; /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. 
We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by 
name */ /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { struct ifaddr *ia; /* XXX lock? */ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. The 'versrcreach' * option just checks that the source address is reachable via any route * (except default) in the routing table. These two are a measure to block * forged packets. This is also commonly known as "anti-spoofing" or Unicast * Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that syntax is * misleading. The check may be performed on all IP packets whether unicast, * multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp) { struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. 
* We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &ifnet, if_link) TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) return 1; } } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; rtalloc_ign((struct route *)&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use 
rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. * (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port); return i; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp_seq ack, seq; int flags; struct { struct ip6_hdr ip6; struct tcphdr th; } ti; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) != 0) { m_freem(m); args->m = NULL; return; } ti.ip6 = *ip6; ti.th = *tcp; ti.th.th_seq = ntohl(ti.th.th_seq); ti.th.th_ack = ntohl(ti.th.th_ack); ti.ip6.ip6_nxt = IPPROTO_TCP; if (ti.th.th_flags & TH_ACK) { ack = 0; seq = ti.th.th_ack; flags = TH_RST; } else { ack = ti.th.th_seq; if ((m->m_flags & M_PKTHDR) != 0) { /* * total new data to ACK is: * total packet length, * minus the 
header length, * minus the tcp header length. */ ack += m->m_pkthdr.len - hlen - (ti.th.th_off << 2); } else if (ip6->ip6_plen) { ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) - hlen - (ti.th.th_off << 2); } else { m_freem(m); return; } if (tcp->th_flags & TH_SYN) ack++; seq = 0; flags = TH_RST|TH_ACK; } bcopy(&ti, ip6, sizeof(ti)); /* * m is only used to recycle the mbuf * The data in it is never read so we don't need * to correct the offsets or anything */ tcp_respond(NULL, ip6, tcp, m, ack, seq, flags); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else m_freem(m); args->m = NULL; } #endif /* INET6 */ static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! 
*/ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { struct ether_header *eh = args->eh; char *action; int limit_reached = 0; char action2[40], proto[128], fragment[32]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct 
in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(dummyaddr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; char src[48], dst[48]; struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntoa_r(ip->ip_src, src); inet_ntoa_r(ip->ip_dst, dst); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), 
"ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.frag_id6, ntohs(ip6->ip6_plen) - hlen, ntohs(offset & IP6F_OFF_MASK) << 3, (offset & IP6F_MORE_FRAG) ? "+" : ""); } else #endif { int ip_off, ip_len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) ? "+" : ""); } } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. 
*/ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. 
*/ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) return; last_remove = time_uptime; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. 
* Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && q->dyn_type != O_LIMIT_PARENT) { if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6)) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.dst_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.src_ip6)) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? 
flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_uptime + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_uptime + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. 
*/ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! 
*/ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. 
*/
static int
install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
    struct ip_fw_args *args, uint32_t tablearg)
{
	static int last_log;	/* rate-limits the messages to one per second */
	ipfw_dyn_rule *q;
	struct in_addr da;
	char src[48], dst[48];

	src[0] = '\0';
	dst[0] = '\0';

	DEB(
	printf("ipfw: %s: type %d 0x%08x %u -> 0x%08x %u\n",
	    __func__, cmd->o.opcode,
	    (args->f_id.src_ip), (args->f_id.src_port),
	    (args->f_id.dst_ip), (args->f_id.dst_port));
	)

	IPFW_DYN_LOCK();

	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);

	if (q != NULL) {	/* should never occur */
		if (last_log != time_uptime) {
			last_log = time_uptime;
			printf("ipfw: %s: entry already present, done\n",
			    __func__);
		}
		IPFW_DYN_UNLOCK();
		return (0);
	}

	if (dyn_count >= dyn_max)
		/* Run out of slots, try to remove any expired rule. */
		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);

	if (dyn_count >= dyn_max) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			printf("ipfw: %s: Too many dynamic rules\n", __func__);
		}
		IPFW_DYN_UNLOCK();
		return (1);	/* cannot install, notify caller */
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE:	/* bidir rule */
		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
		break;

	case O_LIMIT: {		/* limit number of sessions */
		struct ipfw_flow_id id;
		ipfw_dyn_rule *parent;
		uint32_t conn_limit;
		uint16_t limit_mask = cmd->limit_mask;

		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
		    tablearg : cmd->conn_limit;

		DEB(
		if (cmd->conn_limit == IP_FW_TABLEARG)
			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
			    "(tablearg)\n", __func__, conn_limit);
		else
			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
			    __func__, conn_limit);
		)

		/*
		 * Build the parent key from the fields selected by
		 * limit_mask.  Start from an all-zero id so that every
		 * unselected field (including the IPv6 addresses) has a
		 * defined value; the key is hashed, compared and copied
		 * into the parent entry.
		 */
		bzero(&id, sizeof(id));
		id.proto = args->f_id.proto;
		id.addr_type = args->f_id.addr_type;

		if (IS_IP6_FLOW_ID (&(args->f_id))) {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip6 = args->f_id.src_ip6;
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip6 = args->f_id.dst_ip6;
		} else {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip = args->f_id.src_ip;
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip = args->f_id.dst_ip;
		}
		if (limit_mask & DYN_SRC_PORT)
			id.src_port = args->f_id.src_port;
		if (limit_mask & DYN_DST_PORT)
			id.dst_port = args->f_id.dst_port;
		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
			printf("ipfw: %s: add parent failed\n", __func__);
			IPFW_DYN_UNLOCK();
			return (1);
		}

		if (parent->count >= conn_limit) {
			/* See if we can remove some expired rule. */
			remove_dyn_rule(rule, parent);
			if (parent->count >= conn_limit) {
				if (fw_verbose && last_log != time_uptime) {
					last_log = time_uptime;
#ifdef INET6
					/*
					 * XXX IPv6 flows are not
					 * supported yet.
					 */
					if (IS_IP6_FLOW_ID(&(args->f_id))) {
						char ip6buf[INET6_ADDRSTRLEN];
						snprintf(src, sizeof(src),
						    "[%s]", ip6_sprintf(ip6buf,
							&args->f_id.src_ip6));
						snprintf(dst, sizeof(dst),
						    "[%s]", ip6_sprintf(ip6buf,
							&args->f_id.dst_ip6));
					} else
#endif
					{
						da.s_addr =
						    htonl(args->f_id.src_ip);
						inet_ntoa_r(da, src);
						da.s_addr =
						    htonl(args->f_id.dst_ip);
						inet_ntoa_r(da, dst);
					}
					log(LOG_SECURITY | LOG_DEBUG,
					    "%s %s:%u -> %s:%u, %s\n",
					    "drop session",
					    src, (args->f_id.src_port),
					    dst, (args->f_id.dst_port),
					    "too many entries");
				}
				IPFW_DYN_UNLOCK();
				return (1);
			}
		}
		/* For O_LIMIT the third argument carries the parent entry. */
		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
		break;
	}
	default:
		printf("ipfw: %s: unknown dynamic rule type %u\n",
		    __func__, cmd->o.opcode);
		IPFW_DYN_UNLOCK();
		return (1);
	}

	/* XXX just set lifetime */
	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);

	IPFW_DYN_UNLOCK();
	return (0);
}

/*
 * Generate a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because of a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN selects
 * its direction (forward if set, reverse if clear).
 * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
 * so that MAC can label the reply appropriately.
*/ static struct mbuf * send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) return (NULL); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC if (replyto != NULL) mac_create_mbuf_netlayer(replyto, m); else mac_create_mbuf_from_firewall(m); #else (void)replyto; /* don't warn about unused arg */ #endif m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ? 
*/ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); } /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ if (args->eh != NULL) { ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. 
*/ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); if (cmd->opcode == O_ALTQ) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } static int add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { struct radix_node_head *rnh; struct table_entry *ent; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); if (ent == NULL) return (ENOMEM); ent->value = value; ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; IPFW_WLOCK(&layer3_chain); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { IPFW_WUNLOCK(&layer3_chain); free(ent, M_IPFW_TBL); return (EEXIST); } IPFW_WUNLOCK(&layer3_chain); return (0); } static int del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa, mask; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; sa.sin_len = mask.sin_len = 8; mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; IPFW_WLOCK(ch); ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); if (ent == NULL) { IPFW_WUNLOCK(ch); return (ESRCH); } IPFW_WUNLOCK(ch); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static int flush_table(struct ip_fw_chain *ch, uint16_t tbl) { struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; KASSERT(rnh != NULL, ("NULL IPFW table")); rnh->rnh_walktree(rnh, flush_table_entry, rnh); return (0); } static void flush_tables(struct ip_fw_chain *ch) { uint16_t tbl; IPFW_WLOCK_ASSERT(ch); for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) flush_table(ch, tbl); } static int init_tables(struct ip_fw_chain *ch) { int i; uint16_t j; for (i = 0; i < IPFW_TABLES_MAX; i++) { if (!rn_inithead((void **)&ch->tables[i], 32)) { for (j = 0; j < i; j++) { (void) flush_table(ch, j); } return (ENOMEM); } } return (0); } static int lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa; if (tbl >= IPFW_TABLES_MAX) return (0); rnh = ch->tables[tbl]; sa.sin_len = 8; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); if (ent != NULL) { *val = ent->value; return (1); } return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } static int count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; *cnt = 0; rnh->rnh_walktree(rnh, count_table_entry, cnt); return (0); } 
static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } static int dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl->tbl]; tbl->cnt = 0; rnh->rnh_walktree(rnh, dump_table_entry, tbl); return (0); } static void fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp) { struct ucred *cr; if (inp->inp_socket != NULL) { cr = inp->inp_socket->so_cred; ugp->fw_prid = jailed(cr) ? cr->cr_prison->pr_id : -1; ugp->fw_uid = cr->cr_uid; ugp->fw_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups)); } } static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; int match; gid_t *gp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *lookup == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { fill_ugid_cache(inp, ugp); *lookup = 1; } } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. 
*/ if (*lookup == -1) return (0); if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = INPLOOKUP_WILDCARD; pi = &udbinfo; } else return 0; match = 0; if (*lookup == 0) { INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb != NULL) { INP_LOCK(pcb); if (pcb->inp_socket != NULL) { fill_ugid_cache(pcb, ugp); *lookup = 1; } INP_UNLOCK(pcb); } INP_INFO_RUNLOCK(pi); if (*lookup == 0) { /* * If the lookup did not yield any results, there * is no sense in coming back and trying again. So * we can set lookup to -1 and ensure that we wont * bother the pcb system again. */ *lookup = -1; return (0); } } if (insn->o.opcode == O_UID) match = (ugp->fw_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) { for (gp = ugp->fw_groups; gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++) if (*gp == (gid_t)insn->d[0]) { match = 1; break; } } else if (insn->o.opcode == O_JAIL) match = (ugp->fw_prid == (int)insn->d[0]); return match; } #ifdef IPFIREWALL_NAT static eventhandler_tag ifaddr_event_tag; static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; IPFW_WLOCK(&layer3_chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &layer3_chain.nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. 
*/ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) { mtx_lock(&ifp->if_addr_mtx); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); } mtx_unlock(&ifp->if_addr_mtx); } } IPFW_WUNLOCK(&layer3_chain); } static void flush_nat_ptrs(const int i) { struct ip_fw *rule; IPFW_WLOCK_ASSERT(&layer3_chain); for (rule = layer3_chain.rules; rule; rule = rule->next) { ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule); if (cmd->o.opcode != O_NAT) continue; if (cmd->nat != NULL && cmd->nat->id == i) cmd->nat = NULL; } } static struct cfg_nat * lookup_nat(const int i) { struct cfg_nat *ptr; LIST_FOREACH(ptr, &layer3_chain.nat, _next) if (ptr->id == i) return(ptr); return (NULL); } #define HOOK_NAT(b, p) do { \ IPFW_WLOCK_ASSERT(&layer3_chain); \ LIST_INSERT_HEAD(b, p, _next); \ } while (0) #define UNHOOK_NAT(p) do { \ IPFW_WLOCK_ASSERT(&layer3_chain); \ LIST_REMOVE(p, _next); \ } while (0) #define HOOK_REDIR(b, p) do { \ LIST_INSERT_HEAD(b, p, _next); \ } while (0) #define HOOK_SPOOL(b, p) do { \ LIST_INSERT_HEAD(b, p, _next); \ } while (0) static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case REDIR_ADDR: case REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? 
*/ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r, *ser_r; struct cfg_spool *s, *ser_s; int cnt, off, i; char *panic_err; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct cfg_redir *)&buf[off]; r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += SOF_REDIR; r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { panic_err = "LibAliasRedirect* returned NULL"; goto bad; } else /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct cfg_spool *)&buf[off]; s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); memcpy(s, ser_s, SOF_SPOOL); LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += SOF_SPOOL; /* Hook spool entry. */ HOOK_SPOOL(&r->spool_chain, s); } /* And finally hook this redir entry. */ HOOK_REDIR(&ptr->redir_chain, r); } return (1); bad: /* something really bad happened: panic! */ panic("%s\n", panic_err); } #endif /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. 
* Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->cookie a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state during the processing of a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). 
*/ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ struct ip_fw_ugid fw_ugid_cache; int ugid_lookup = 0; /* * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG * associated with a packet input on a divert socket. This * will allow to distinguish traffic and its direction when * it originates from a divert socket. */ u_int divinput_flags = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset == 0 means there is no Fragment Header. * If offset != 0 for IPv6 always use correct mask to * get the correct offset because we add IP6F_MORE_FRAG * to be able to dectect the first fragment which would * otherwise have offset = 0. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. 
*/ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int pktlen; u_int16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &layer3_chain; struct m_tag *mtag; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; u_int16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; if (m->m_flags & M_SKIP_FIREWALL) return (IP_FW_PASS); /* accept */ pktlen = m->m_pkthdr.len; proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(len, p, T) \ do { \ int x = (len) + sizeof(T); \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. 
*/ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); args->f_id.flags = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; /* Add IP6F_MORE_FRAG for offset of first * fragment to be != 0. 
*/ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.frag_id6 = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_ext); /* Packet ends here. if ip6e_len!=0 octets * must be ignored. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? 
*/ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } pktlen = ip_len < pktlen ? 
ip_len : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = mtag ? divert_cookie(mtag) : 0; f = chain->rules; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) { IPFW_RUNLOCK(chain); return (IP_FW_DENY); /* invalid */ } while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) { /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } } } /* reset divert rule to avoid confusion later */ if (mtag) { divinput_flags = divert_info(mtag) & (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG); m_tag_delete(m, mtag); } /* * Now scan the rules, and parse microinstructions for each rule. 
*/ for (; f; f = f->next) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset!=0) break; if (is_ipv6) /* XXX to be fixed later */ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &fw_ugid_cache, &ugid_lookup, args->inp); break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? 
oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: match = (cmd->arg1 & 1 && divinput_flags & IP_FW_DIVERT_LOOPBACK_FLAG) || (cmd->arg1 & 2 && divinput_flags & IP_FW_DIVERT_OUTPUT_FLAG); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v; match = lookup_table(chain, cmd->arg1, a, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? 
args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = ip_len; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = ip_len - 
((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == TCP(ulp)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; at = pf_get_mtag(m); if (at == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } at->qid = altq->qid; if (is_ipv4) at->af = AF_INET; else at->af = AF_LINK; at->hdr = ip; m_tag_prepend(m, mtag); break; } case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? 
verify_path6(&(args->f_id.src_ip6), NULL) : #endif verify_path(src_ip, NULL))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); break; case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). 
* Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); } else if (mtag == NULL) { if ((mtag = m_tag_alloc(MTAG_IPFW, tag, 0, M_NOWAIT)) != NULL) m_tag_prepend(m, mtag); } match = (cmd->len & F_NOT) ? 0: 1; break; } case O_TAGGED: { uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. 
* These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached to make * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { retval = IP_FW_DENY; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += pktlen; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; IPFW_DYN_UNLOCK(); goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. 
*/ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = IP_FW_DUMMYNET; goto done; case O_DIVERT: case O_TEE: { struct divert_tag *dt; if (args->eh) /* not on layer 2 */ break; mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT); if (mtag == NULL) { /* XXX statistic */ /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } dt = (struct divert_tag *)(mtag+1); dt->cookie = f->rulenum; if (cmd->arg1 == IP_FW_TABLEARG) dt->info = tablearg; else dt->info = cmd->arg1; m_tag_prepend(m, mtag); retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; goto done; } case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. 
*/ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, ip_len, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(args->f_id.flags) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; goto done; case O_FORWARD_IP: { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) { if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval = IP_FW_PASS; } goto done; case O_NETGRAPH: case O_NGTEE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; goto done; #ifdef IPFIREWALL_NAT case O_NAT: { struct cfg_nat *t; struct mbuf *mcl; /* XXX - libalias duct tape */ int ldt; char *c; ldt = 0; args->rule = f; /* Report matching rule. 
*/ retval = 0; t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { t = lookup_nat(cmd->arg1); if (t == NULL) { retval = IP_FW_DENY; goto done; } else ((ipfw_insn_nat *)cmd)->nat = t; } if ((mcl = m_megapullup(m, m->m_pkthdr.len)) == NULL) goto badnat; ip = mtod(mcl, struct ip *); if (args->eh == NULL) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only * pseudo-header checksum calculated * and libalias will screw it[1], so * mark them for later fix. Moreover * there are cases when libalias * modify tcp packet data[2], mark it * for later fix too. * * [1] libalias was never meant to run * in kernel, so it doesn't have any * knowledge about checksum * offloading, and it expects a packet * with a full internet * checksum. Unfortunately, packets * generated locally will have just the * pseudo header calculated, and when * libalias tries to adjust the * checksum it will actually screw it. * * [2] when libalias modify tcp's data * content, full TCP checksum has to * be recomputed: the problem is that * libalias doesn't have any idea * about checksum offloading To * workaround this, we do not do * checksumming in LibAlias, but only * mark the packets in th_x2 field. If * we receive a marked packet, we * calculate correct checksum for it * aware of offloading. Why such a * terrible hack instead of * recalculating checksum for each * packet? Because the previous * checksum was not checked! * Recalculating checksums for EVERY * packet will hide ALL transmission * errors. Yes, marked packets still * suffer from this problem. But, * sigh, natd(8) has this problem, * too. 
* * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); if (oif == NULL) retval = LibAliasIn(t->lib, c, MCLBYTES); else retval = LibAliasOut(t->lib, c, MCLBYTES); if (retval != PKT_ALIAS_OK) { /* XXX - should i add some logging? */ m_free(mcl); badnat: args->m = NULL; retval = IP_FW_DENY; goto done; } mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; u_short cksum; ip->ip_len = ntohs(ip->ip_len); cksum = in_pseudo( ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)) ); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* * No hw checksum offloading: do it * by ourself. 
*/ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); } if (args->eh == NULL) { ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } args->m = mcl; retval = IP_FW_NAT; goto done; } #endif default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("ipfw: ouch!, skip past end of rules, denying packet\n"); IPFW_RUNLOCK(chain); return (IP_FW_DENY); done: /* Update statistics */ f->pcnt++; f->bcnt += pktlen; f->timestamp = time_uptime; IPFW_RUNLOCK(chain); return (retval); pullup_failed: if (fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. */ static void flush_rule_ptrs(struct ip_fw_chain *chain) { struct ip_fw *rule; IPFW_WLOCK_ASSERT(chain); for (rule = chain->rules; rule; rule = rule->next) rule->next_rule = NULL; } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. 
*/
/*
 * add_rule() return values:
 *	0	the rule was copied and linked into the chain;
 *	EINVAL	the chain is empty and the rule is not the default rule, or
 *		no installed rule has a higher number so the copy cannot be
 *		linked anywhere;
 *	ENOSPC	the copy of the rule could not be allocated.
 */
static int
add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
{
	struct ip_fw *rule, *f, *prev;
	int l = RULESIZE(input_rule);

	/* The first rule ever installed must be the default rule. */
	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
		return (EINVAL);

	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
	if (rule == NULL)
		return (ENOSPC);

	bcopy(input_rule, rule, l);

	/* The private copy starts unlinked and with clean counters. */
	rule->next = NULL;
	rule->next_rule = NULL;

	rule->pcnt = 0;
	rule->bcnt = 0;
	rule->timestamp = 0;

	IPFW_WLOCK(chain);

	if (chain->rules == NULL) {	/* default rule */
		chain->rules = rule;
		goto done;
	}

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add autoinc_step
	 */
	if (autoinc_step < 1)
		autoinc_step = 1;
	else if (autoinc_step > 1000)
		autoinc_step = 1000;
	if (rule->rulenum == 0) {
		/*
		 * locate the highest numbered rule before default
		 */
		for (f = chain->rules; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			rule->rulenum = f->rulenum;
		}
		if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step)
			rule->rulenum += autoinc_step;
		/* Report the number actually chosen back to the caller. */
		input_rule->rulenum = rule->rulenum;
	}

	/*
	 * Now insert the new rule in the right place in the sorted list.
	 */
	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
		if (f->rulenum > rule->rulenum) { /* found the location */
			if (prev) {
				rule->next = f;
				prev->next = rule;
			} else { /* head insert */
				rule->next = chain->rules;
				chain->rules = rule;
			}
			break;
		}
	}
	if (f == NULL) {
		/*
		 * No installed rule has a higher number, so the copy was
		 * never linked.  Fail instead of leaking it and counting
		 * a rule that is not in the chain.
		 */
		IPFW_WUNLOCK(chain);
		free(rule, M_IPFW);
		return (EINVAL);
	}
	flush_rule_ptrs(chain);
done:
	static_count++;
	static_len += l;
	IPFW_WUNLOCK(chain);
	DEB(printf("ipfw: installed rule %d, static count now %d\n",
		rule->rulenum, static_count);)
	return (0);
}

/**
 * Remove a static rule (including derived dynamic rules)
 * and place it on the ``reap list'' for later reclamation.
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
*/ static struct ip_fw * remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { struct ip_fw *n; int l = RULESIZE(rule); IPFW_WLOCK_ASSERT(chain); n = rule->next; IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); if (prev == NULL) chain->rules = n; else prev->next = n; static_count--; static_len -= l; rule->next = chain->reap; chain->reap = rule; return n; } /** * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. */ static void reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); } } /* * Remove all rules from a chain (except rules in set RESVD_SET * unless kill_default = 1). The caller is responsible for * reclaiming storage for the rules left in chain->reap. */ static void free_chain(struct ip_fw_chain *chain, int kill_default) { struct ip_fw *prev, *rule; IPFW_WLOCK_ASSERT(chain); flush_rule_ptrs(chain); /* more efficient to do outside the loop */ for (prev = NULL, rule = chain->rules; rule ; ) if (kill_default || rule->set != RESVD_SET) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } /** * Remove all rules with given number, and also do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an u_int32_t. 
The low 16 bit are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers * 5 delete rules with given number and with given set number */ static int del_entry(struct ip_fw_chain *chain, u_int32_t arg) { struct ip_fw *prev = NULL, *rule; u_int16_t rulenum; /* rule or old_set */ u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (rulenum >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > RESVD_SET) /* old_set */ return EINVAL; } IPFW_WLOCK(chain); rule = chain->rules; chain->reap = NULL; switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return EINVAL; } /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. 
*/ flush_rule_ptrs(chain); while (rule->rulenum == rulenum) rule = remove_rule(chain, rule, prev); break; case 1: /* delete all rules with given set number */ flush_rule_ptrs(chain); rule = chain->rules; while (rule->rulenum < IPFW_DEFAULT_RULE) if (rule->set == rulenum) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } break; case 2: /* move rules with given number to new set */ rule = chain->rules; for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; break; case 3: /* move rules with given set number to new set */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; break; case 4: /* swap two sets */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; break; case 5: /* delete rules with given number and with given set number. * rulenum - given rule number; * new_set - given set number. */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return (EINVAL); } flush_rule_ptrs(chain); while (rule->rulenum == rulenum) { if (rule->set == new_set) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } } /* * Look for rules to reclaim. We grab the list before * releasing the lock then reclaim them w/o the lock to * avoid a LOR with dummynet. */ rule = chain->reap; chain->reap = NULL; IPFW_WUNLOCK(chain); if (rule) reap_rules(rule); return 0; } /* * Clear counters for a specific rule. * The enclosing "table" is assumed locked. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. 
* The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_WLOCK(chain); if (rulenum == 0) { norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) { /* Skip rules from another set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. */ for (rule = chain->rules; rule; rule = rule->next) if (rule->rulenum == rulenum) { while (rule && rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); rule = rule->next; } cleared = 1; break; } if (!cleared) { /* we did not find any matching rules */ IPFW_WUNLOCK(chain); return (EINVAL); } msg = log_only ? "ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } IPFW_WUNLOCK(chain); if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert. * Fortunately rules are simple, so this mostly need to check rule sizes. 
*/ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: 
invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (!NG_IPFW_LOADED) return EINVAL; else goto check_size; case O_NAT: #ifdef IPFIREWALL_NAT if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; 
} break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; int i; time_t boot_seconds; boot_seconds = boottime.tv_sec; /* XXX this can take a long time and locking will block packet flow */ IPFW_RLOCK(chain); for (rule = chain->rules; rule ; rule = rule->next) { /* * Verify the entry fits in the buffer in case the * rules changed between calculating buffer space and * now. This would be better done using a generation * number but should suffice for now. */ i = RULESIZE(rule); if (bp + i <= ep) { bcopy(rule, bp, i); /* * XXX HACK. Store the disable mask in the "next" pointer * in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? 
*/ bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), sizeof(set_disable)); if (((struct ip_fw *)bp)->timestamp) ((struct ip_fw *)bp)->timestamp += boot_seconds; bp += i; } } IPFW_RUNLOCK(chain); if (ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets; i++) for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; bcopy(p, dst, sizeof *p); bcopy(&(p->rule->rulenum), &(dst->rule), sizeof(p->rule->rulenum)); /* * store set number into high word of * dst->rule pointer. */ bcopy(&(p->rule->set), &dst->rule + sizeof(p->rule->rulenum), sizeof(p->rule->set)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ bcopy(&dst, &dst->next, sizeof(dst)); last = dst; dst->expire = TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime ; bp += sizeof(ipfw_dyn_rule); } } IPFW_DYN_UNLOCK(); if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); } return (bp - (char *)buf); } /** * {set|get}sockopt parser. */ static int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size; struct ip_fw *buf, *rule; u_int32_t rulenum[2]; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. 
* * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, ipfw_getrules(&layer3_chain, buf, size)); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ IPFW_WLOCK(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 0 /* keep default rule */); rule = layer3_chain.reap; layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (rule != NULL) reap_rules(rule); break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw) ); if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. 
Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(&layer3_chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = add_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = del_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; IPFW_WLOCK(&layer3_chain); error = flush_table(&layer3_chain, tbl); IPFW_WUNLOCK(&layer3_chain); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; IPFW_RLOCK(&layer3_chain); error = count_table(&layer3_chain, tbl, &cnt); IPFW_RUNLOCK(&layer3_chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = 
sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); IPFW_RLOCK(&layer3_chain); error = dump_table(&layer3_chain, tbl); IPFW_RUNLOCK(&layer3_chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; #ifdef IPFIREWALL_NAT case IP_FW_NAT_CFG: { struct cfg_nat *ptr, *ser_n; char *buf; buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); ser_n = (struct cfg_nat *)buf; /* * Find/create nat rule. */ IPFW_WLOCK(&layer3_chain); ptr = lookup_nat(ser_n->id); if (ptr == NULL) { /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_NOWAIT | M_ZERO); if (ptr == NULL) { IPFW_WUNLOCK(&layer3_chain); free(buf, M_IPFW); return (ENOSPC); } ptr->lib = LibAliasInit(NULL); if (ptr->lib == NULL) { IPFW_WUNLOCK(&layer3_chain); free(ptr, M_IPFW); free(buf, M_IPFW); return (EINVAL); } LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarly unhook it. */ UNHOOK_NAT(ptr); flush_nat_ptrs(ser_n->id); } IPFW_WUNLOCK(&layer3_chain); /* * Basic nat configuration. */ ptr->id = ser_n->id; /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ser_n->ip; ptr->redir_cnt = ser_n->redir_cnt; ptr->mode = ser_n->mode; LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode); LibAliasSetAddress(ptr->lib, ptr->ip); memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. 
*/ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); free(buf, M_IPFW); IPFW_WLOCK(&layer3_chain); HOOK_NAT(&layer3_chain.nat, ptr); IPFW_WUNLOCK(&layer3_chain); } break; case IP_FW_NAT_DEL: { struct cfg_nat *ptr; int i; error = sooptcopyin(sopt, &i, sizeof i, sizeof i); IPFW_WLOCK(&layer3_chain); ptr = lookup_nat(i); if (ptr == NULL) { error = EINVAL; IPFW_WUNLOCK(&layer3_chain); break; } UNHOOK_NAT(ptr); flush_nat_ptrs(i); IPFW_WUNLOCK(&layer3_chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } break; case IP_FW_NAT_GET_CONFIG: { uint8_t *data; struct cfg_nat *n; struct cfg_redir *r; struct cfg_spool *s; int nat_cnt, off; nat_cnt = 0; off = sizeof(nat_cnt); data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); IPFW_RLOCK(&layer3_chain); /* Serialize all the data. */ LIST_FOREACH(n, &layer3_chain.nat, _next) { nat_cnt++; if (off + SOF_NAT < NAT_BUF_LEN) { bcopy(n, &data[off], SOF_NAT); off += SOF_NAT; LIST_FOREACH(r, &n->redir_chain, _next) { if (off + SOF_REDIR < NAT_BUF_LEN) { bcopy(r, &data[off], SOF_REDIR); off += SOF_REDIR; LIST_FOREACH(s, &r->spool_chain, _next) { if (off + SOF_SPOOL < NAT_BUF_LEN) { bcopy(s, &data[off], SOF_SPOOL); off += SOF_SPOOL; } else goto nospace; } } else goto nospace; } } else goto nospace; } bcopy(&nat_cnt, data, sizeof(nat_cnt)); IPFW_RUNLOCK(&layer3_chain); error = sooptcopyout(sopt, data, NAT_BUF_LEN); free(data, M_IPFW); break; nospace: IPFW_RUNLOCK(&layer3_chain); printf("serialized data buffer not big enough:" "please increase NAT_BUF_LEN\n"); free(data, M_IPFW); } break; case IP_FW_NAT_GET_LOG: { uint8_t *data; struct cfg_nat *ptr; int i, size, cnt, sof; data = NULL; sof = LIBALIAS_BUF_SIZE; cnt = 0; IPFW_RLOCK(&layer3_chain); size = i = 0; LIST_FOREACH(ptr, &layer3_chain.nat, _next) { if (ptr->lib->logDesc == NULL) continue; cnt++; size = cnt * (sof + sizeof(int)); data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(&layer3_chain); 
return (ENOSPC); } bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], sof); i += sof; } IPFW_RUNLOCK(&layer3_chain); error = sooptcopyout(sopt, data, size); free(data, M_IPFW); } break; #endif default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here he have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period */ static void ipfw_tick(void * __unused unused) { struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. 
*/ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ( time_uptime+dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) continue; /* too late, rule expired */ *mtailp = send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; *mtailp = send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; } } IPFW_DYN_UNLOCK(); for (m = mnext = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip_output(m, NULL, NULL, 0, NULL, NULL); } done: callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); } int ipfw_init(void) { struct ip_fw default_rule; int error; #ifdef INET6 /* Setup IPv6 fw sysctl tree. 
*/ sysctl_ctx_init(&ip6_fw_sysctl_ctx); ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, &fw_deny_unknown_exthdrs, 0, "Deny packets with unknown IPv6 Extension Headers"); #endif layer3_chain.rules = NULL; IPFW_LOCK_INIT(&layer3_chain); ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); - callout_init(&ipfw_timeout, NET_CALLOUT_MPSAFE); + callout_init(&ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = RESVD_SET; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; error = add_rule(&layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_default_rule = layer3_chain.rules; printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif default_rule.cmd[0].opcode == O_ACCEPT ? 
"accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); error = init_tables(&layer3_chain); if (error) { IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); #ifdef IPFIREWALL_NAT LIST_INIT(&layer3_chain.nat); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); #endif return (0); } void ipfw_destroy(void) { struct ip_fw *reap; #ifdef IPFIREWALL_NAT struct cfg_nat *ptr, *ptr_temp; #endif ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; callout_drain(&ipfw_timeout); IPFW_WLOCK(&layer3_chain); flush_tables(&layer3_chain); #ifdef IPFIREWALL_NAT LIST_FOREACH_SAFE(ptr, &layer3_chain.nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); #endif layer3_chain.reap = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); reap = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (reap != NULL) reap_rules(reap); IPFW_DYN_LOCK_DESTROY(); uma_zdestroy(ipfw_dyn_rule_zone); IPFW_LOCK_DESTROY(&layer3_chain); #ifdef INET6 /* Free IPv6 fw sysctl tree. 
*/ sysctl_ctx_free(&ip6_fw_sysctl_ctx); #endif printf("IP firewall unloaded\n"); } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 171636) +++ head/sys/netinet/ip_mroute.c (revision 171637) @@ -1,3156 +1,3156 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include /* * Control debugging code for rsvp and multicast routing code. * Can only set them with the debugger. 
*/ static u_int rsvpdebug; /* non-zero enables debugging */ static u_int mrtdebug; /* any set of the flags below */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_PIM 0x20 #define VIFI_INVALID ((vifi_t) -1) #define M_HASCL(m) ((m)->m_flags & M_EXT) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. It may be better to add more fine-grained locking later; * it's not clear how performance-critical this code is. * * XXX: This module could particularly benefit from being cleaned * up to use the macros. * */ static struct mrtstat mrtstat; SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); static struct mfc *mfctable[MFCTBLSIZ]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() do { \ mtx_assert(&mrouter_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() do { \ mtx_assert(&mfc_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) #define 
MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static struct vif viftable[MAXVIFS]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, &viftable, sizeof(viftable), "S,vif[MAXVIFS]", "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static u_char nexpire[MFCTBLSIZ]; static eventhandler_tag if_detach_event_tag = NULL; static struct callout expire_upcalls_ch; #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ #define ENCAP_TTL 64 /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
*/ #define BW_METER_BUCKETS 1024 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; static struct callout bw_meter_ch; #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; static u_int bw_upcalls_n; /* # of pending upcalls */ static struct callout bw_upcalls_ch; #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static struct pimstat pimstat; SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, &pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = (pr_output_t*)rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; #ifdef INET6 /* ip6_mroute.c glue */ extern struct in6_protosw in6_pim_protosw; static const struct encaptab *pim6_encap_cookie; extern int X_ip6_mrouter_set(struct socket *, struct sockopt *); extern int X_ip6_mrouter_get(struct socket *, struct sockopt *); extern int X_ip6_mrouter_done(void); extern int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); extern int X_mrt6_ioctl(int, caddr_t); #endif static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; 
uint32_t flags; }; static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static struct ifnet multicast_register_if; static vifi_t reg_vif_num = VIFI_INVALID; /* * Private variables. */ static vifi_t numvifs; static u_long X_ip_mcast_src(int vifi); static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *arg __unused, struct ifnet *); static int ip_mrouter_init(struct socket *, int); static int add_vif(struct vifctl *); static int del_vif_locked(vifi_t); static int del_vif(vifi_t); static int add_mfc(struct mfcctl2 *); static int del_mfc(struct mfcctl2 *); static int set_api_config(uint32_t *); /* chose API capabilities */ static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static int set_assert(int); static void expire_upcalls(void *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static void send_packet(struct vif *, struct mbuf *); /* * Bandwidth monitoring */ static void free_bw_list(struct bw_meter *list); static int add_bw_upcall(struct bw_upcall *); static int del_bw_upcall(struct bw_upcall *); static void 
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp); static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); static void bw_upcalls_send(void); static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); static void unschedule_bw_meter(struct bw_meter *x); static void bw_meter_process(void); static void expire_bw_upcalls_send(void *); static void expire_bw_meter_process(void *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); /* * whether or not special PIM assert processing is enabled. */ static int pim_assert; /* * Rate limit for assert notification messages, in usec */ #define ASSERT_MSG_TIME 3000000 /* * Kernel multicast routing API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. 
*/ static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static uint32_t mrt_api_config = 0; /* * Hash function for a source, group entry */ #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ ((g) >> 20) ^ ((g) >> 10) ^ (g)) /* * Find a route for a given origin IP address and Multicast group address * Statistics are updated by the caller if needed * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) */ static struct mfc * mfc_find(in_addr_t o, in_addr_t g) { struct mfc *rt; MFC_LOCK_ASSERT(); for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) if ((rt->mfc_origin.s_addr == o) && (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) break; return rt; } /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) { \ int xxs; \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) /* * Handle MRT setsockopt commands to modify the multicast routing tables. 
*/ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int 
error; static int version = 0x0305; /* !!! why is this here? XXX */ switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &version, sizeof version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(int cmd, caddr_t data) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(req->src.s_addr, req->grp.s_addr); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = viftable[vifi].v_pkt_in; req->ocount = viftable[vifi].v_pkt_out; req->ibytes = viftable[vifi].v_bytes_in; req->obytes = viftable[vifi].v_bytes_out; VIF_UNLOCK(); 
return 0; } static void ip_mrouter_reset(void) { bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); pim_assert = 0; mrt_api_config = 0; - callout_init(&expire_upcalls_ch, NET_CALLOUT_MPSAFE); + callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); bw_upcalls_n = 0; bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); - callout_init(&bw_upcalls_ch, NET_CALLOUT_MPSAFE); - callout_init(&bw_meter_ch, NET_CALLOUT_MPSAFE); + callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE); + callout_init(&bw_meter_ch, CALLOUT_MPSAFE); } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; int i; struct mfc *mfc; struct mfc *nmfc; struct mfc **ppmfc; /* Pointer to previous node's next-pointer */ struct rtdetq *pq; struct rtdetq *npq; MROUTER_LOCK(); if (ip_mrouter == NULL) { MROUTER_UNLOCK(); } /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Free any pending mbufs for this mfc. * 4. Free the associated mfc entry and state associated with this vif. * Be very careful about unlinking from a singly-linked list whose * "head node" is a pointer in a simple array. * 5. Free vif state. This should disable ALLMULTI on the interface. 
*/ VIF_LOCK(); MFC_LOCK(); for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < MFCTBLSIZ; i++) { ppmfc = &mfctable[i]; for (mfc = mfctable[i]; mfc != NULL; ) { nmfc = mfc->mfc_next; if (mfc->mfc_parent == vifi) { for (pq = mfc->mfc_stall; pq != NULL; ) { npq = pq->next; m_freem(pq->m); free(pq, M_MRTABLE); pq = npq; } free_bw_list(mfc->mfc_bw_meter); free(mfc, M_MRTABLE); *ppmfc = nmfc; } else { ppmfc = &mfc->mfc_next; } mfc = nmfc; } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast routing */ static int ip_mrouter_init(struct socket *so, int version) { if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { MROUTER_UNLOCK(); return (ENOMEM); } callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); ip_mrouter = so; MROUTER_UNLOCK(); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init\n"); return 0; } /* * Disable multicast routing */ static int X_ip_mrouter_done(void) { vifi_t vifi; int i; struct ifnet *ifp; struct ifreq ifr; struct mfc *rt; struct rtdetq *rte; MROUTER_LOCK(); if (ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ ip_mrouter = NULL; mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. 
*/ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_lcl_addr.s_addr != 0 && !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); so->sin_len = sizeof(struct sockaddr_in); so->sin_family = AF_INET; so->sin_addr.s_addr = INADDR_ANY; ifp = viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)viftable, sizeof(viftable)); numvifs = 0; pim_assert = 0; VIF_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); /* * Free all multicast forwarding cache entries. */ callout_stop(&expire_upcalls_ch); callout_stop(&bw_upcalls_ch); callout_stop(&bw_meter_ch); MFC_LOCK(); for (i = 0; i < MFCTBLSIZ; i++) { for (rt = mfctable[i]; rt != NULL; ) { struct mfc *nr = rt->mfc_next; for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } free_bw_list(rt->mfc_bw_meter); free(rt, M_MRTABLE); rt = nr; } } bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); bw_upcalls_n = 0; bzero(bw_meter_timers, sizeof(bw_meter_timers)); MFC_UNLOCK(); reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_done\n"); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; pim_assert = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { int i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. 
I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (numvifs > 0) { *apival = 0; return EPERM; } if (pim_assert) { *apival = 0; return EPERM; } for (i = 0; i < MFCTBLSIZ; i++) { if (mfctable[i] != NULL) { *apival = 0; return EPERM; } } mrt_api_config = *apival & mrt_api_support; *apival = mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { VIF_UNLOCK(); return EADDRINUSE; } if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. 
*/ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { log(LOG_ERR, "tunnels are no longer supported\n"); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &multicast_register_if; if (mrtdebug) log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", (void *)&multicast_register_if); if (reg_vif_num == VIFI_INVALID) { if_initname(&multicast_register_if, "register_vif", 0); multicast_register_if.if_flags = IFF_LOOPBACK; reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; vifp->v_rsvp_on = 0; vifp->v_rsvpd = NULL; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; bzero(&vifp->v_route, sizeof(vifp->v_route)); /* Adjust numvifs up if the vifi is higher than numvifs */ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); if (mrtdebug) log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x\n", vifcp->vifc_vifi, (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= numvifs) { return EINVAL; } vifp = &viftable[vifi]; if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); if (mrtdebug) log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); /* Adjust numvifs down */ for (vifi = numvifs; vifi > 0; vifi--) if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) break; numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. 
*/
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
    rt->mfc_origin     = mfccp->mfcc_origin;
    rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;

    /* Parent vif, per-vif ttls/flags and RP address. */
    update_mfc_params(rt, mfccp);

    /* initialize pkt counters per src-grp */
    rt->mfc_pkt_cnt    = 0;
    rt->mfc_byte_cnt   = 0;
    rt->mfc_wrong_if   = 0;
    rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
}

/*
 * Add an mfc entry
 *
 * Three cases are handled, in this order:
 *  1. a resolved entry for (origin, group) exists: update its parameters;
 *  2. one or more stalled entries exist (packets queued waiting for the
 *     daemon): initialise them and forward the queued packets;
 *  3. neither: reuse any entry with the same (origin, group) on the hash
 *     chain, or allocate a new one and link it at the head of the chain.
 *
 * Returns 0 on success, ENOBUFS if a new entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl2 *mfccp)
{
    struct mfc *rt;
    u_long hash;
    struct rtdetq *rte;
    u_short nstl;	/* number of stalled entries found for this (o, g) */

    VIF_LOCK();
    MFC_LOCK();

    rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);

    /* If an entry already exists, just update the fields */
    if (rt) {
        if (mrtdebug & DEBUG_MFC)
            log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
                (u_long)ntohl(mfccp->mfcc_origin.s_addr),
                (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
                mfccp->mfcc_parent);

        update_mfc_params(rt, mfccp);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return 0;
    }

    /*
     * Find the entry for which the upcall was made and update
     */
    hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
    for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {

        if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
                (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
                (rt->mfc_stall != NULL)) {

            /* More than one stalled entry for the same flow is logged. */
            if (nstl++)
                log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
                    "multiple kernel entries",
                    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
                    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
                    mfccp->mfcc_parent, (void *)rt->mfc_stall);

            if (mrtdebug & DEBUG_MFC)
                log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
                    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
                    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
                    mfccp->mfcc_parent, (void *)rt->mfc_stall);

            init_mfc_params(rt, mfccp);

            rt->mfc_expire = 0;	/* Don't clean this guy up */
            nexpire[hash]--;

            /* free packets Qed at the end of this entry */
            for (rte = rt->mfc_stall; rte != NULL; ) {
                struct rtdetq *n = rte->next;

                /* Forward the queued packet, then release it. */
                ip_mdq(rte->m, rte->ifp, rt, -1);
                m_freem(rte->m);
                free(rte, M_MRTABLE);
                rte = n;
            }
            rt->mfc_stall = NULL;
        }
    }

    /*
     * It is possible that an entry is being inserted without an upcall
     */
    if (nstl == 0) {
        if (mrtdebug & DEBUG_MFC)
            log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
                hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
                (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
                mfccp->mfcc_parent);

        for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
            if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
                    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
                init_mfc_params(rt, mfccp);
                /* The entry no longer expires; keep nexpire[] in step. */
                if (rt->mfc_expire)
                    nexpire[hash]--;
                rt->mfc_expire = 0;
                break; /* XXX */
            }
        }
        if (rt == NULL) {		/* no upcall, so make a new entry */
            rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
            if (rt == NULL) {
                MFC_UNLOCK();
                VIF_UNLOCK();
                return ENOBUFS;
            }

            init_mfc_params(rt, mfccp);
            rt->mfc_expire     = 0;
            rt->mfc_stall      = NULL;

            rt->mfc_bw_meter = NULL;
            /* insert new entry at head of hash chain */
            rt->mfc_next = mfctable[hash];
            mfctable[hash] = rt;
        }
    }
    MFC_UNLOCK();
    VIF_UNLOCK();
    return 0;
}

/*
 * Delete an mfc entry
 *
 * Only a resolved entry (mfc_stall == NULL) matching the origin and group
 * in the request is removed.  Returns EADDRNOTAVAIL if there is none.
 */
static int
del_mfc(struct mfcctl2 *mfccp)
{
    struct in_addr	origin;
    struct in_addr	mcastgrp;
    struct mfc		*rt;
    struct mfc		**nptr;	/* address of the link that points at rt */
    u_long		hash;
    struct bw_meter	*list;

    origin = mfccp->mfcc_origin;
    mcastgrp = mfccp->mfcc_mcastgrp;

    if (mrtdebug & DEBUG_MFC)
        log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
            (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));

    MFC_LOCK();

    hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
    for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
        if (origin.s_addr == rt->mfc_origin.s_addr &&
                mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
                rt->mfc_stall == NULL)
            break;
    if (rt == NULL) {
        MFC_UNLOCK();
        return EADDRNOTAVAIL;
    }

    /* Unlink the entry from its hash chain. */
    *nptr = rt->mfc_next;

    /*
     * free the bw_meter entries
     */
    list = rt->mfc_bw_meter;
    rt->mfc_bw_meter = NULL;

    free(rt, M_MRTABLE);

    free_bw_list(list);

    MFC_UNLOCK();

    return 0;
}

/*
 * Send a message to the routing daemon on the multicast routing socket
 */
static int
socket_send(struct socket *s, struct mbuf *mm,
struct sockaddr_in *src)
{
    if (s) {
        SOCKBUF_LOCK(&s->so_rcv);
        if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
            NULL) != 0) {
            /* Appended: wake the reader and report success. */
            sorwakeup_locked(s);
            return 0;
        }
        SOCKBUF_UNLOCK(&s->so_rcv);
    }
    /* No socket, or the append failed: the message is dropped here. */
    m_freem(mm);
    return -1;
}

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 */

#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
    struct mfc *rt;
    int error;
    vifi_t vifi;

    if (mrtdebug & DEBUG_FORWARD)
        log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
            (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
            (void *)ifp);

    if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
                ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
        /*
         * Packet arrived via a physical interface or
         * an encapsulated tunnel or a register_vif.
         */
    } else {
        /*
         * Packet arrived through a source-route tunnel.
         * Source-route tunnels are no longer supported.
         */
        static int last_log;	/* limits the message to one per time_uptime tick */
        if (last_log != time_uptime) {
            last_log = time_uptime;
            log(LOG_ERR,
                "ip_mforward: received source-routed packet from %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr));
        }
        return 1;
    }

    VIF_LOCK();
    MFC_LOCK();
    /* The sender named an explicit vif: transmit on that vif only. */
    if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
        if (ip->ip_ttl < MAXTTL)
            ip->ip_ttl++;	/* compensate for -1 in *_send routines */
        if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
            struct vif *vifp = viftable + vifi;

            printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
                (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
                vifi,
                (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
                vifp->v_ifp->if_xname);
        }
        error = ip_mdq(m, ifp, NULL, vifi);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    }
    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
        printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
            (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
        if (!imo)
            printf("In fact, no options were specified at all\n");
    }

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
        MFC_UNLOCK();
        VIF_UNLOCK();
        return 0;
    }

    /*
     * Determine forwarding vifs from the forwarding cache table
     */
    ++mrtstat.mrts_mfc_lookups;
    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
        error = ip_mdq(m, ifp, rt, -1);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    } else {
        /*
         * If we don't have a route for packet's origin,
         * Make a copy of the packet & send message to routing daemon
         */

        struct mbuf *mb0;	/* private copy of the packet, queued on the entry */
        struct rtdetq *rte;	/* queue node that will hold mb0 */
        u_long hash;
        int hlen = ip->ip_hl << 2;

        ++mrtstat.mrts_mfc_misses;

        mrtstat.mrts_no_route++;
        if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
            log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr),
                (u_long)ntohl(ip->ip_dst.s_addr));

        /*
         * Allocate mbufs early so that we don't do extra work if we are
         * just going to fail anyway.  Make sure to pullup the header so
         * that other people can't step on it.
         */
        rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
        if (rte == NULL) {
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }
        mb0 = m_copypacket(m, M_DONTWAIT);
        if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
            mb0 = m_pullup(mb0, hlen);
        if (mb0 == NULL) {
            free(rte, M_MRTABLE);
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }

        /* is there an upcall waiting for this flow ? */
        hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
        for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
            if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
                    (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
                    (rt->mfc_stall != NULL))
                break;
        }

        if (rt == NULL) {
            int i;
            struct igmpmsg *im;
            struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
            struct mbuf *mm;

            /*
             * Locate the vifi for the incoming interface for this packet.
             * If none found, drop packet.
             */
            for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
                ;
            if (vifi >= numvifs)	/* vif not found, drop packet */
                goto non_fatal;

            /* no upcall, so make a new entry */
            rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
            if (rt == NULL)
                goto fail;
            /* Make a copy of the header to send to the user level process */
            mm = m_copy(mb0, 0, hlen);
            if (mm == NULL)
                goto fail1;

            /*
             * Send message to routing daemon to install
             * a route into the kernel table
             */

            im = mtod(mm, struct igmpmsg *);
            im->im_msgtype = IGMPMSG_NOCACHE;
            im->im_mbz = 0;
            im->im_vif = vifi;

            mrtstat.mrts_upcalls++;

            k_igmpsrc.sin_addr = ip->ip_src;
            if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
                log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
                ++mrtstat.mrts_upq_sockfull;
                /*
                 * Shared error exit: fail1 and fail are also reached by
                 * goto from the allocation failures above.
                 */
fail1:
                free(rt, M_MRTABLE);
fail:
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return ENOBUFS;
            }

            /* insert new entry at head of hash chain */
            rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
            rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
            rt->mfc_expire	      = UPCALL_EXPIRE;
            nexpire[hash]++;
            for (i = 0; i < numvifs; i++) {
                rt->mfc_ttls[i] = 0;
                rt->mfc_flags[i] = 0;
            }
            rt->mfc_parent = -1;

            rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */

            rt->mfc_bw_meter = NULL;

            /* link into table */
            rt->mfc_next   = mfctable[hash];
            mfctable[hash] = rt;
            rt->mfc_stall = rte;

        } else {
            /* determine if q has overflowed */
            int npkts = 0;
            struct rtdetq **p;

            /*
             * XXX ouch! we need to append to the list, but we
             * only have a pointer to the front, so we have to
             * scan the entire list every time.
             */
            for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
                npkts++;

            if (npkts > MAX_UPQ) {
                mrtstat.mrts_upq_ovflw++;
                /*
                 * Shared drop exit: also reached by goto when the
                 * receiving interface is not a vif.  Returns 0.
                 */
non_fatal:
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return 0;
            }

            /* Add this entry to the end of the queue */
            *p = rte;
        }

        rte->m			= mb0;
        rte->ifp		= ifp;
        rte->next		= NULL;

        MFC_UNLOCK();
        VIF_UNLOCK();

        return 0;
    }
}

/*
 * Clean up the cache entry if upcall is not serviced
 *
 * Callout handler: re-arms itself with EXPIRE_TIMEOUT on every run.
 */
static void
expire_upcalls(void *unused)
{
    struct rtdetq *rte;
    struct mfc *mfc, **nptr;	/* nptr: address of the link pointing at mfc */
    int i;

    MFC_LOCK();
    for (i = 0; i < MFCTBLSIZ; i++) {
        /* Nothing on this chain is waiting to expire. */
        if (nexpire[i] == 0)
            continue;
        nptr = &mfctable[i];
        for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
            /*
             * Skip real cache entries
             * Make sure it wasn't marked to not expire (shouldn't happen)
             * If it expires now
             */
            if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
                    --mfc->mfc_expire == 0) {
                if (mrtdebug & DEBUG_EXPIRE)
                    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
                        (u_long)ntohl(mfc->mfc_origin.s_addr),
                        (u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
                /*
                 * drop all the packets
                 * free the mbuf with the pkt, if, timing info
                 */
                for (rte = mfc->mfc_stall; rte; ) {
                    struct rtdetq *n = rte->next;

                    m_freem(rte->m);
                    free(rte, M_MRTABLE);
                    rte = n;
                }
                ++mrtstat.mrts_cache_cleanups;
                nexpire[i]--;

                /*
                 * free the bw_meter entries
                 */
                while (mfc->mfc_bw_meter != NULL) {
                    struct bw_meter *x = mfc->mfc_bw_meter;

                    mfc->mfc_bw_meter = x->bm_mfc_next;
                    free(x, M_BWMETER);
                }

                /* Unlink and free; nptr now refers to the next entry. */
                *nptr = mfc->mfc_next;
                free(mfc, M_MRTABLE);
            } else {
                nptr = &mfc->mfc_next;
            }
        }
    }
    MFC_UNLOCK();

    callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
}

/*
 * Packet forwarding routine once entry in the cache is made
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
    struct ip  *ip = mtod(m, struct ip *);
    vifi_t vifi;
    int plen = ip->ip_len;

    VIF_LOCK_ASSERT();

    /*
     * If xmt_vif is not -1, send on only the
requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < numvifs) { if (viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + xmt_vif, m, rt); else phyint_send(ip, viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { /* came in the wrong interface */ if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); ++mrtstat.mrts_wrong_if; ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { struct timeval now; u_long delta; if (ifp == &multicast_register_if) pimstat.pims_rcv_registers_wrongiif++; /* Get vifi for the incoming packet */ for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= numvifs) return 0; /* The iif is not found: ignore the packet. 
*/ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ GET_TIME(now); TV_DELTA(now, rt->mfc_last_assert, delta); if (delta > ASSERT_MSG_TIME) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); if (mm && (M_HASCL(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; rt->mfc_last_assert = now; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = im->im_src; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; } else { viftable[vifi].v_pkt_in++; viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; if (viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + vifi, m, rt); else phyint_send(ip, viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; GET_TIME(now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * check if a vif number is legal/ok. This is used by ip_output. 
*/ static int X_legal_vif_num(int vif) { /* XXX unlocked, matter? */ return (vif >= 0 && vif < numvifs); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { /* XXX unlocked, matter? */ if (vifi >= 0 && vifi < numvifs) return viftable[vifi].v_lcl_addr.s_addr; else return INADDR_ANY; } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. 
*/ error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL); if (mrtdebug & DEBUG_XMIT) { log(LOG_DEBUG, "phyint_send on vif %td err %d\n", vifp - viftable, error); } } static int X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) { int error, vifi; if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) return error; VIF_LOCK(); if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */ VIF_UNLOCK(); return EADDRNOTAVAIL; } if (sopt->sopt_name == IP_RSVP_VIF_ON) { /* Check if socket is available. */ if (viftable[vifi].v_rsvpd != NULL) { VIF_UNLOCK(); return EADDRINUSE; } viftable[vifi].v_rsvpd = so; /* This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 1; rsvp_on++; } } else { /* must be VIF_OFF */ /* * XXX as an additional consistency check, one could make sure * that viftable[vifi].v_rsvpd == so, otherwise passing so as * first parameter is pretty useless. */ viftable[vifi].v_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } VIF_UNLOCK(); return 0; } static void X_ip_rsvp_force_done(struct socket *so) { int vifi; /* Don't bother if it is not the right type of socket. */ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return; VIF_LOCK(); /* The socket may be attached to more than one vif...this * is perfectly legal. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_rsvpd == so) { viftable[vifi].v_rsvpd = NULL; /* This may seem silly, but we need to be sure we don't * over-decrement the RSVP counter, in case something slips up. 
*/ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } } VIF_UNLOCK(); } static void X_rsvp_input(struct mbuf *m, int off) { int vifi; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; struct ifnet *ifp; if (rsvpdebug) printf("rsvp_input: rsvp_on %d\n",rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (rsvpdebug) printf("rsvp_input: check vifs\n"); #ifdef DIAGNOSTIC M_ASSERTPKTHDR(m); #endif ifp = m->m_pkthdr.rcvif; VIF_LOCK(); /* Find which vif the packet arrived on. */ for (vifi = 0; vifi < numvifs; vifi++) if (viftable[vifi].v_ifp == ifp) break; if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { /* * Drop the lock here to avoid holding it across rip_input. * This could make rsvpdebug printfs wrong. If you care, * record the state of stuff before dropping the lock. */ VIF_UNLOCK(); /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there * is no specific socket for this vif. 
*/ if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ } else { if (rsvpdebug && vifi == numvifs) printf("rsvp_input: Can't find vif for packet.\n"); else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) printf("rsvp_input: No socket defined for vif %d\n",vifi); m_freem(m); } return; } rsvp_src.sin_addr = ip->ip_src; if (rsvpdebug && m) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); } else { if (rsvpdebug) printf("rsvp_input: send packet up\n"); } VIF_UNLOCK(); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test 
if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; GET_TIME(now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 
/* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= 
x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. 
*/ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &bw_upcalls[bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = bw_upcalls_n * sizeof(bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (bw_upcalls_n == 0) return; /* No pending upcalls */ bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. 
*/ MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m->m_len = m->m_pkthdr.len = 0; m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. 
*/ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { static uint32_t last_tv_sec; /* last time we processed this */ uint32_t loops; int i; struct timeval now, process_endtime; GET_TIME(now); if (last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - last_tv_sec; last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. 
*/ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = bw_meter_timers[i]; bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *unused) { MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. 
*/ static void expire_bw_meter_process(void *unused) { if (mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_register_send: "); /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (mrt_api_config & MRT_MFC_RP) && (rt->mfc_rp.s_addr == INADDR_ANY)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((mrt_api_config & MRT_MFC_RP) && (rt->mfc_rp.s_addr != INADDR_ANY)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
*/ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ip->ip_len <= mtu) { /* Turn the IP header into a valid one */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ MGETHDR(mb_first, M_DONTWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { if (mrtdebug & DEBUG_PIM) log(LOG_WARNING, "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } /* Keep statistics */ pimstat.pims_snd_registers_msgs++; pimstat.pims_snd_registers_bytes += len; return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the 
RP. */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp,
    struct mbuf *mb_copy, struct mfc *rt)
{
    /* Combined size of the encapsulating IP and PIM Register headers. */
    const int encap_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    struct pim_encap_pimhdr *pimhdr;
    struct mbuf *hdr;
    struct ip *outer;
    vifi_t vifi = rt->mfc_parent;
    int len = ntohs(ip->ip_len);

    VIF_LOCK_ASSERT();

    /* The iif vif must exist and have a local address to send from. */
    if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
        m_freem(mb_copy);
        return EADDRNOTAVAIL;           /* The iif vif is invalid */
    }

    /*
     * Add a new mbuf with the encapsulating header
     */
    MGETHDR(hdr, M_DONTWAIT, MT_DATA);
    if (hdr == NULL) {
        m_freem(mb_copy);
        return ENOBUFS;
    }
    hdr->m_data += max_linkhdr;
    hdr->m_len = encap_len;
    hdr->m_next = mb_copy;
    hdr->m_pkthdr.len = len + encap_len;

    /*
     * Fill in the encapsulating IP and PIM header
     */
    outer = mtod(hdr, struct ip *);
    *outer = pim_encap_iphdr;
    outer->ip_id = ip_newid();
    outer->ip_len = len + encap_len;
    outer->ip_src = viftable[vifi].v_lcl_addr;
    outer->ip_dst = rt->mfc_rp;
    /*
     * Copy the inner header TOS to the outer header, and take care of the
     * IP_DF bit.
     */
    outer->ip_tos = ip->ip_tos;
    if (ntohs(ip->ip_off) & IP_DF)
        outer->ip_off |= IP_DF;

    /* The PIM Register header sits right behind the outer IP header. */
    pimhdr = (struct pim_encap_pimhdr *)(outer + 1);
    *pimhdr = pim_encap_pimhdr;
    /* If the iif crosses a border, set the Border-bit */
    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
        pimhdr->flags |= htonl(PIM_BORDER_REGISTER);

    /*
     * Checksum only the PIM Register header: temporarily advance m_data
     * past the outer IP header, then put it back.
     */
    hdr->m_data += sizeof(pim_encap_iphdr);
    pimhdr->pim.pim_cksum = in_cksum(hdr, sizeof(pim_encap_pimhdr));
    hdr->m_data -= sizeof(pim_encap_iphdr);

    send_packet(vifp, hdr);

    /* Keep statistics */
    pimstat.pims_snd_registers_msgs++;
    pimstat.pims_snd_registers_bytes += len;

    return 0;
}

/*
 * pim_encapcheck() is called by the encap[46]_input() path at runtime to
 * determine if a packet is for PIM; allowing PIM to be dynamically loaded
 * into the kernel.
*/
static int
pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{

#ifdef DIAGNOSTIC
    KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
#endif
    /* m, off and arg are not examined; only the protocol number decides. */
    if (proto != IPPROTO_PIM)
        return 0;       /* not for us; reject the datagram. */

    return 64;          /* claim the datagram. */
}

/*
 * PIM-SMv2 and PIM-DM messages processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 *
 * m is the received packet with m_data at the outer IP header; off is
 * used as the outer IP header length.  The mbuf is always consumed:
 * it is either freed here or handed on to rip_input()/if_simloop().
 */
void
pim_input(struct mbuf *m, int off)
{
    struct ip *ip = mtod(m, struct ip *);
    struct pim *pim;
    int minlen;
    int datalen = ip->ip_len;
    int ip_tos;
    int iphlen = off;

    /* Keep statistics */
    pimstat.pims_rcv_total_msgs++;
    pimstat.pims_rcv_total_bytes += datalen;

    /*
     * Validate lengths
     */
    if (datalen < PIM_MINLEN) {
        pimstat.pims_rcv_tooshort++;
        log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
            datalen, (u_long)ip->ip_src.s_addr);
        m_freem(m);
        return;
    }

    /*
     * If the packet is at least as big as a REGISTER, go ahead
     * and grab the PIM REGISTER header size, to avoid another
     * possible m_pullup() later.
     *
     * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
     * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
     */
    minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
    /*
     * Get the IP and PIM headers in contiguous memory, and
     * possibly the PIM REGISTER header.
     */
    if ((m->m_flags & M_EXT || m->m_len < minlen) &&
        (m = m_pullup(m, minlen)) == 0) {
        /* No m_freem() on this path: m is already NULL here. */
        log(LOG_ERR, "pim_input: m_pullup failure\n");
        return;
    }
    /* m_pullup() may have given us a new mbuf so reset ip. */
    ip = mtod(m, struct ip *);
    ip_tos = ip->ip_tos;

    /* adjust mbuf to point to the PIM header */
    m->m_data += iphlen;
    m->m_len  -= iphlen;
    pim = mtod(m, struct pim *);

    /*
     * Validate checksum. If PIM REGISTER, exclude the data packet.
     *
     * XXX: some older PIMv2 implementations don't make this distinction,
     * so for compatibility reason perform the checksum over part of the
     * message, and if error, then over the whole message.
     */
    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
        /* do nothing, checksum okay */
    } else if (in_cksum(m, datalen)) {
        pimstat.pims_rcv_badsum++;
        if (mrtdebug & DEBUG_PIM)
            log(LOG_DEBUG, "pim_input: invalid checksum");
        m_freem(m);
        return;
    }

    /* PIM version check: versions below PIM_VERSION are rejected */
    if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
        pimstat.pims_rcv_badversion++;
        log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
            PIM_VT_V(pim->pim_vt), PIM_VERSION);
        m_freem(m);
        return;
    }

    /* restore mbuf back to the outer IP */
    m->m_data -= iphlen;
    m->m_len  += iphlen;

    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
        /*
         * Since this is a REGISTER, we'll make a copy of the register
         * headers ip + pim + u_int32 + encap_ip, to be passed up to the
         * routing daemon.
         */
        struct sockaddr_in dst = { sizeof(dst), AF_INET };
        struct mbuf *mcp;
        struct ip *encap_ip;
        u_int32_t *reghdr;
        struct ifnet *vifp;

        /* A register vif must be configured to loop the inner packet. */
        VIF_LOCK();
        if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
            VIF_UNLOCK();
            if (mrtdebug & DEBUG_PIM)
                log(LOG_DEBUG,
                    "pim_input: register vif not set: %d\n", reg_vif_num);
            m_freem(m);
            return;
        }
        /* XXX need refcnt? */
        vifp = viftable[reg_vif_num].v_ifp;
        VIF_UNLOCK();

        /*
         * Validate length
         */
        if (datalen < PIM_REG_MINLEN) {
            pimstat.pims_rcv_tooshort++;
            pimstat.pims_rcv_badregisters++;
            log(LOG_ERR,
                "pim_input: register packet size too small %d from %lx\n",
                datalen, (u_long)ip->ip_src.s_addr);
            m_freem(m);
            return;
        }

        /* The register header follows the PIM header, then the inner IP. */
        reghdr = (u_int32_t *)(pim + 1);
        encap_ip = (struct ip *)(reghdr + 1);

        if (mrtdebug & DEBUG_PIM) {
            log(LOG_DEBUG,
                "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
                (u_long)ntohl(encap_ip->ip_src.s_addr),
                (u_long)ntohl(encap_ip->ip_dst.s_addr),
                ntohs(encap_ip->ip_len));
        }

        /* verify the version number of the inner packet */
        if (encap_ip->ip_v != IPVERSION) {
            pimstat.pims_rcv_badregisters++;
            if (mrtdebug & DEBUG_PIM) {
                log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
                    "of the inner packet\n", encap_ip->ip_v);
            }
            m_freem(m);
            return;
        }

        /* verify the inner packet is destined to a mcast group */
        if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
            pimstat.pims_rcv_badregisters++;
            if (mrtdebug & DEBUG_PIM)
                log(LOG_DEBUG,
                    "pim_input: inner packet of register is not "
                    "multicast %lx\n",
                    (u_long)ntohl(encap_ip->ip_dst.s_addr));
            m_freem(m);
            return;
        }

        /* If a NULL_REGISTER, pass it to the daemon */
        if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
            goto pim_input_to_daemon;

        /*
         * Copy the TOS from the outer IP header to the inner IP header.
         */
        if (encap_ip->ip_tos != ip_tos) {
            /* Outer TOS -> inner TOS */
            encap_ip->ip_tos = ip_tos;
            /* Recompute the inner header checksum. Sigh... */

            /* adjust mbuf to point to the inner IP header */
            m->m_data += (iphlen + PIM_MINLEN);
            m->m_len  -= (iphlen + PIM_MINLEN);

            encap_ip->ip_sum = 0;
            encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);

            /* restore mbuf to point back to the outer IP header */
            m->m_data -= (iphlen + PIM_MINLEN);
            m->m_len  += (iphlen + PIM_MINLEN);
        }

        /*
         * Decapsulate the inner IP packet and loopback to forward it
         * as a normal multicast packet. Also, make a copy of the
         *     outer_iphdr + pimhdr + reghdr + encap_iphdr
         * to pass to the daemon later, so it can take the appropriate
         * actions (e.g., send back PIM_REGISTER_STOP).
         * XXX: here m->m_data points to the outer IP header.
         */
        mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
        if (mcp == NULL) {
            log(LOG_ERR,
                "pim_input: pim register: could not copy register head\n");
            m_freem(m);
            return;
        }

        /* Keep statistics */
        /* XXX: registers_bytes include only the encap. mcast pkt */
        pimstat.pims_rcv_registers_msgs++;
        pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);

        /*
         * forward the inner ip packet; point m_data at the inner ip.
         */
        m_adj(m, iphlen + PIM_MINLEN);

        if (mrtdebug & DEBUG_PIM) {
            log(LOG_DEBUG,
                "pim_input: forwarding decapsulated register: "
                "src %lx, dst %lx, vif %d\n",
                (u_long)ntohl(encap_ip->ip_src.s_addr),
                (u_long)ntohl(encap_ip->ip_dst.s_addr),
                reg_vif_num);
        }
        /* NB: vifp was collected above; can it change on us? */
        if_simloop(vifp, m, dst.sin_family, 0);

        /* prepare the register head to send to the mrouting daemon */
        m = mcp;
    }

pim_input_to_daemon:
    /*
     * Pass the PIM message up to the daemon; if it is a Register message,
     * pass the 'head' only up to the daemon. This includes the
     * outer IP header, PIM header, PIM-Register header and the
     * inner IP header.
     * XXX: the outer IP header pkt size of a Register is not adjust to
     * reflect the fact that the inner multicast data is truncated.
     */
    rip_input(m, iphlen);

    return;
}

/*
 * XXX: This is common code for dealing with initialization for both
 * the IPv4 and IPv6 multicast forwarding paths. It could do with cleanup.
*/
/*
 * Module event handler.
 *
 * MOD_LOAD initializes the locks, attaches the PIM encapsulation
 * handler(s) and installs the X_* entry points in the global hook
 * pointers.  MOD_UNLOAD reverses this, refusing with EINVAL while
 * ip_mrouter (or ip6_mrouter under INET6) is non-NULL.  Any other event
 * yields EOPNOTSUPP.  The mod and unused arguments are not examined.
 */
static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{
    switch (type) {
    case MOD_LOAD:
        MROUTER_LOCK_INIT();
        MFC_LOCK_INIT();
        VIF_LOCK_INIT();
        ip_mrouter_reset();
        TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
            &pim_squelch_wholepkt);

        pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM,
            pim_encapcheck, &in_pim_protosw, NULL);
        if (pim_encap_cookie == NULL) {
            printf("ip_mroute: unable to attach pim encap\n");
            /* Undo the lock initialization done above. */
            VIF_LOCK_DESTROY();
            MFC_LOCK_DESTROY();
            MROUTER_LOCK_DESTROY();
            return (EINVAL);
        }

#ifdef INET6
        pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM,
            pim_encapcheck, (struct protosw *)&in6_pim_protosw, NULL);
        if (pim6_encap_cookie == NULL) {
            printf("ip_mroute: unable to attach pim6 encap\n");
            /* Also undo the IPv4 attachment made just before. */
            if (pim_encap_cookie) {
                encap_detach(pim_encap_cookie);
                pim_encap_cookie = NULL;
            }
            VIF_LOCK_DESTROY();
            MFC_LOCK_DESTROY();
            MROUTER_LOCK_DESTROY();
            return (EINVAL);
        }
#endif

        /* Install the implementations behind the global hook pointers. */
        ip_mcast_src = X_ip_mcast_src;
        ip_mforward = X_ip_mforward;
        ip_mrouter_done = X_ip_mrouter_done;
        ip_mrouter_get = X_ip_mrouter_get;
        ip_mrouter_set = X_ip_mrouter_set;

#ifdef INET6
        ip6_mforward = X_ip6_mforward;
        ip6_mrouter_done = X_ip6_mrouter_done;
        ip6_mrouter_get = X_ip6_mrouter_get;
        ip6_mrouter_set = X_ip6_mrouter_set;
        mrt6_ioctl = X_mrt6_ioctl;
#endif

        ip_rsvp_force_done = X_ip_rsvp_force_done;
        ip_rsvp_vif = X_ip_rsvp_vif;

        legal_vif_num = X_legal_vif_num;
        mrt_ioctl = X_mrt_ioctl;
        rsvp_input_p = X_rsvp_input;
        break;

    case MOD_UNLOAD:
        /*
         * Typically module unload happens after the user-level
         * process has shutdown the kernel services (the check
         * below insures someone can't just yank the module out
         * from under a running process).  But if the module is
         * just loaded and then unloaded w/o starting up a user
         * process we still need to cleanup.
         */
        if (ip_mrouter
#ifdef INET6
            || ip6_mrouter
#endif
            )
            return EINVAL;

#ifdef INET6
        if (pim6_encap_cookie) {
            encap_detach(pim6_encap_cookie);
            pim6_encap_cookie = NULL;
        }
        X_ip6_mrouter_done();
        ip6_mforward = NULL;
        ip6_mrouter_done = NULL;
        ip6_mrouter_get = NULL;
        ip6_mrouter_set = NULL;
        mrt6_ioctl = NULL;
#endif

        if (pim_encap_cookie) {
            encap_detach(pim_encap_cookie);
            pim_encap_cookie = NULL;
        }
        X_ip_mrouter_done();
        /* Clear every hook pointer that MOD_LOAD installed. */
        ip_mcast_src = NULL;
        ip_mforward = NULL;
        ip_mrouter_done = NULL;
        ip_mrouter_get = NULL;
        ip_mrouter_set = NULL;

        ip_rsvp_force_done = NULL;
        ip_rsvp_vif = NULL;

        legal_vif_num = NULL;
        mrt_ioctl = NULL;
        rsvp_input_p = NULL;

        VIF_LOCK_DESTROY();
        MFC_LOCK_DESTROY();
        MROUTER_LOCK_DESTROY();
        break;

    default:
        return EOPNOTSUPP;
    }
    return 0;
}

/* Module description: name, event handler, and an unused extra argument. */
static moduledata_t ip_mroutemod = {
    "ip_mroute",
    ip_mroute_modevent,
    0
};
DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c (revision 171636)
+++ head/sys/netinet/tcp_subr.c (revision 171637)
@@ -1,2152 +1,2148 @@
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. 
This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. 
*/
/* Parent node for the net.inet.tcp.inflight.* knobs declared below. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
    "TCP inflight data limiting");

static int	tcp_inflight_enable = 1;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0,
    "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

/*
 * No static initializer here; exposed through the sysctl_msec_to_ticks
 * handler rather than a plain SYSCTL_INT.
 */
static int	tcp_inflight_rttthresh;
SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
    "RTT threshold below which inflight will deactivate itself");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");

/* UMA zone handle for struct sackhole allocations. */
uma_zone_t sack_hole_zone;

/* Forward declarations for file-local functions. */
static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_isn_tick(void *);

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
*/
/* Allocation unit for tcpcb_zone: the tcpcb plus its timer block. */
struct tcpcb_mem {
	struct	tcpcb		tcb;
	struct	tcp_timer	tt;
};

static uma_zone_t tcpcb_zone;
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
struct callout isn_callout;
static struct mtx isn_mtx;

/* Helpers around isn_mtx. */
#define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
#define	ISN_LOCK()	mtx_lock(&isn_mtx)
#define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)

/*
 * TCP initialization.
 */

/*
 * maxsockets_change event handler (registered in tcp_init()): re-applies
 * the maxsockets limit to the inpcb and tcpcb zones and calls
 * tcp_tw_zone_change().
 */
static void
tcp_zone_change(void *tag)
{

	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcp_tw_zone_change();
}

/*
 * Init callback passed to uma_zcreate() for the inpcb zone: initializes
 * the inpcb lock.  Always returns 0.
 */
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "tcpinp");
	return (0);
}

/*
 * Set the timer defaults, initialize tcbinfo (lock, list head, hash tables,
 * inpcb zone), create the tcpcb and sackhole zones, initialize the
 * timewait, syncache, hostcache and reassembly subsystems, start the ISN
 * callout and register the shutdown and maxsockets event handlers.
 */
void
tcp_init(void)
{
	int hashsize = TCBHASHSIZE;

	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;
	tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.ipi_listhead = &tcb;
	/* The compile-time hash size may be overridden by the tunable. */
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.ipi_hashmask);
	tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.ipi_porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	/* Raise max_protohdr if needed; link + protocol header must fit MHLEN. */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcp_tw_init();
	syncache_init();
	tcp_hc_init();
	tcp_reass_init();
	ISN_LOCK_INIT();
	callout_init(&isn_callout, CALLOUT_MPSAFE);
	/* First call arms the self-rescheduling ISN callout. */
	tcp_isn_tick(NULL);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
		SHUTDOWN_PRI_DEFAULT);
	sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
		EVENTHANDLER_PRI_ANY);
}

/*
 * shutdown_pre_sync event handler (registered in tcp_init()): stops the
 * ISN callout.
 */
void
tcp_fini(void *xtp)
{

	callout_stop(&isn_callout);
}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 *
 * ip_ptr is written as a struct ip6_hdr when the inpcb has INP_IPV6 set
 * (INET6 kernels only) and as a struct ip otherwise; tcp_ptr is written as
 * a struct tcphdr.  The inpcb lock must be held.
 */
void
tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
{
	struct tcphdr *th = (struct tcphdr *)tcp_ptr;

	INP_LOCK_ASSERT(inp);

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		/* Only the flow-info and version bits are replaced. */
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
			(IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		/*
		 * NOTE(review): stored without htons(), whereas
		 * tcp_respond() uses htons() for ip6_plen -- confirm that
		 * every consumer rewrites this field before transmit.
		 */
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
	} else
#endif
	{
		struct ip *ip;

		ip = (struct ip *)ip_ptr;
		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
	}
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_x2 = 0;
	th->th_off = 5;
	th->th_flags = 0;
	th->th_win = 0;
	th->th_urp = 0;
	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 * use for this function is in keepalives, which use tcp_respond.
 *
 * Returns a pointer into the new mbuf's data area, or 0 (NULL) when the
 * mbuf allocation fails.
 */
struct tcptemp *
tcpip_maketemplate(struct inpcb *inp)
{
	struct mbuf *m;
	struct tcptemp *n;

	m = m_get(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (0);
	m->m_len = sizeof(struct tcptemp);
	n = mtod(m, struct tcptemp *);

	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
	return (n);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == NULL, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the
 * segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
*/
/*
 * Parameters as used below:
 *   tp     - connection, may be NULL (then m must not be); its inpcb must
 *            be locked when non-NULL.
 *   ipgen  - IP or IPv6 header; the version is read from its ip_v field.
 *   th     - TCP header supplying the ports.
 *   m      - mbuf to reuse for the reply, or NULL to allocate a fresh one.
 *   ack, seq, flags - values for the outgoing TCP header (flags is forced
 *            to TH_ACK when m == NULL).
 */
void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th,
    struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags)
{
	int tlen;
	int win = 0;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;
	struct inpcb *inp;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp != NULL) {
		inp = tp->t_inpcb;
		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
		INP_LOCK_ASSERT(inp);
	} else
		inp = NULL;

	/* Advertise receive space, clamped to the scaled max, except on RST. */
	if (tp != NULL) {
		if (!(flags & TH_RST)) {
			win = sbspace(&inp->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
	}
	if (m == NULL) {
		/* No mbuf supplied: build the packet from copies of the headers. */
		m = m_gethdr(M_DONTWAIT, MT_DATA);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
		ip = mtod(m, struct ip *);
		nth = (struct tcphdr *)(ip + 1);
	      }
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/* Reuse the received mbuf: drop the chain, swap the endpoints. */
		m_freem(m->m_next);
		m->m_next = NULL;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
		nth = (struct tcphdr *)(ip + 1);
	      }
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
		if (path_mtu_discovery)
			ip->ip_off |= IP_DF;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
#ifdef MAC
	if (inp != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		INP_LOCK_ASSERT(inp);
		mac_create_mbuf_from_inpcb(inp, m);
	} else {
		/*
		 * Packet is not associated with a socket, so possibly
		 * update the label in place.
		 */
		mac_reflect_mbuf_tcp(m);
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp != NULL)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
		    NULL, NULL);
	} else
#endif /* INET6 */
	{
		/* Only the pseudo-header sum is stored; CSUM_TCP is flagged. */
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6)
		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
	else
#endif /* INET6 */
	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns NULL when the zone allocation fails.
 */
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	tp->t_timers = &tm->tt;
	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts. */
	/*
	 * The '-' and '+' prefixed lines below are unified-diff hunk lines
	 * from the patch this text belongs to (removed and added lines
	 * respectively).
	 */
-	if (NET_CALLOUT_MPSAFE)
-		callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
-		    CALLOUT_RETURNUNLOCKED);
-	else
-		callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
-		    (CALLOUT_RETURNUNLOCKED|CALLOUT_NETGIANT));
+	callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
+	    CALLOUT_RETURNUNLOCKED);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_sack)
		tp->t_flags |= TF_SACK_PERMIT;
	TAILQ_INIT(&tp->snd_holes);
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = tp;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 *
 * Requires the tcbinfo write lock and the inpcb lock.  Returns whatever
 * tcp_close() returns.
 */
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(tp->t_inpcb);

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	/* A timeout is reported as the recorded soft error, if there is one. */
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Tear down a tcpcb: stop its timer, optionally save metrics via
 * tcp_hc_update(), free the reassembly queue and SACK holes, unlink it
 * from the inpcb and return it to tcpcb_zone.  The inpcb lock must be held.
 */
void
tcp_discardcb(struct tcpcb *tp)
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	INP_LOCK_ASSERT(inp);

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 *
	 * XXX: callout_stop() may race and a callout may already
	 * try to obtain the INP_LOCK.  Only callout_drain() would
	 * stop this but it would cause a LOR thus we can't use it.
	 * The tcp_timer() function contains a lot of checks to
	 * handle this case rather gracefully.
	 */
	tp->t_timers->tt_active = 0;
	callout_stop(&tp->t_timers->tt_timer);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 4 rtt samples.
	 * 4 samples is enough for the srtt filter to converge
	 * to within enough % of the correct value; fewer samples
	 * and we could save a bogus rtt. The danger is not high
	 * as tcp quickly recovers from everything.
	 * XXX: Works very well but needs some more statistics!
	 */
	if (tp->t_rttupdated >= 4) {
		struct hc_metrics_lite metrics;
		u_long ssthresh;

		bzero(&metrics, sizeof(metrics));
		/*
		 * Update the ssthresh always when the conditions below
		 * are satisfied. This gives us better new start value
		 * for the congestion avoidance for new connections.
		 * ssthresh is only set if packet loss occurred on a session.
		 *
		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
		 * being torn down.  Ideally this code would not use 'so'.
		 */
		ssthresh = tp->snd_ssthresh;
		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
			if (ssthresh < 2)
				ssthresh = 2;
			ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
		} else
			ssthresh = 0;
		metrics.rmx_ssthresh = ssthresh;

		metrics.rmx_rtt = tp->t_srtt;
		metrics.rmx_rttvar = tp->t_rttvar;
		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
		metrics.rmx_bandwidth = tp->snd_bandwidth;
		metrics.rmx_cwnd = tp->snd_cwnd;
		metrics.rmx_sendpipe = 0;
		metrics.rmx_recvpipe = 0;

		tcp_hc_update(&inp->inp_inc, &metrics);
	}

	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		tcp_reass_qsize--;
	}
	tcp_free_sackholes(tp);
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
}

/*
 * Attempt to close a TCP control block, marking it as dropped, and freeing
 * the socket if we hold the only reference.
*/ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); in_pcbdrop(inp); tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. 
* If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { m = syncache_pcbcount(); n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. 
*/ INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. 
*/ } else error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) inp_list[i++] = inp; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count + pcb_count; INP_INFO_RUNLOCK(&tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = 
in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. 
*/ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, (tcp_minmss) + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do right thing by itself. 
*/ if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_UNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&tcbinfo); } else in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. 
The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; tcp_seq tcp_new_isn(struct tcpcb *tp) { u_int32_t md5_buffer[4]; tcp_seq new_isn; INP_LOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. 
*/ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / hz boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { u_int32_t projected_offset; ISN_LOCK(); projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (projected_offset > isn_offset) isn_offset = projected_offset; isn_offset_old = isn_offset; callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
*/ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp; struct socket *so = inp->inp_socket; u_int maxmtu; u_int romtu; int mss; #ifdef INET6 int isipv6; #endif /* INET6 */ INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ romtu = #ifdef INET6 isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) : #endif /* INET6 */ tcp_maxmtu(&inp->inp_inc, NULL); if (!maxmtu) maxmtu = romtu; else maxmtu = min(maxmtu, romtu); if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return (inp); } mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. 
For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return (inp); tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp); tcp_output(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; rtalloc_ign(&sro, RTF_CLONING); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. 
*/ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
*/ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only effects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typicaly 16K or 32K). 
As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. 
*/
/*
 * Recompute tp->snd_bandwidth and tp->snd_bwnd from the bytes acknowledged
 * (ack_seq - t_bw_rtseq) over the ticks elapsed since the previous sample.
 * Called with the inpcb lock held (asserted below).
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0 ||
	    tp->t_rttlow < tcp_inflight_rttthresh) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	/* No tick has elapsed since the last sample: nothing to measure. */
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;

	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	/* Start the next measurement interval at this ACK. */
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;
	/* Running average: 15/16 of the old estimate plus 1/16 of the sample. */
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    on very slow connections.  A value no smaller than 5
	 *	    should be used, but only reduce this default if you have
	 *	    no other choice.
	 */
	/* RTT used for the product: midpoint of smoothed and best RTT. */
#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		/* Tick stamp of the last debug line, shared by all callers. */
		static int ltime;
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	/* Clamp to the configured bounds, then to at least two segments. */
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}

#ifdef TCP_SIGNATURE
/*
 * Callback function invoked by m_apply() to digest TCP segment data
 * contained within an mbuf chain.  fstate is the MD5 context being
 * updated; always returns 0.
 */
static int
tcp_signature_apply(void *fstate, void *data, u_int len)
{

	MD5Update(fstate, (u_char *)data, len);
	return (0);
}

/*
 * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385)
 *
 * Parameters:
 * m		pointer to head of mbuf chain
 * off0		offset to TCP header within the mbuf chain
 * len		length of TCP segment data, excluding options
 * optlen	length of TCP segment options
 * buf		pointer to storage for computed MD5 digest
 * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
 *
 * We do this over ip, tcphdr, segment data, and the key in the SADB.
 * When called from tcp_input(), we can be sure that th_sum has been
 * zeroed out and verified already.
 *
 * This function is for IPv4 use only. Calling this function with an
 * IPv6 packet in the mbuf chain will yield undefined results.
 *
 * Return 0 if successful, otherwise return EINVAL (the SADB lookup
 * failed).
* * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); bzero(&dst, sizeof(union sockaddr_union)); dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { printf("%s: SADB lookup failed for %s\n", __func__, inet_ntoa(dst.sin.sin_addr)); return (EINVAL); } MD5Init(&ctx); ipovly = (struct ipovly *)ip; th = (struct tcphdr *)((u_char *)ip + off0); doff = off0 + sizeof(struct tcphdr) + optlen; /* * Step 1: Update MD5 hash with IP pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. 
*/ ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct 
sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { INP_LOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tcp_drop(tp, ECONNABORTED); } INP_UNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. 
*/ char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && inc->inc_isipv6 == 0) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } Index: head/sys/sys/mutex.h =================================================================== --- head/sys/sys/mutex.h (revision 171636) 
+++ head/sys/sys/mutex.h (revision 171637) @@ -1,462 +1,461 @@ /*- * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $ * $FreeBSD$ */ #ifndef _SYS_MUTEX_H_ #define _SYS_MUTEX_H_ #ifndef LOCORE #include #include #include #ifdef _KERNEL #include #include #include #include #endif /* _KERNEL_ */ #endif /* !LOCORE */ #include #ifdef _KERNEL /* * Mutex types and options passed to mtx_init(). MTX_QUIET and MTX_DUPOK * can also be passed in. 
*/
#define	MTX_DEF		0x00000000	/* DEFAULT (sleep) lock */
#define	MTX_SPIN	0x00000001	/* Spin lock (disables interrupts) */
#define	MTX_RECURSE	0x00000004	/* Option: lock allowed to recurse */
#define	MTX_NOWITNESS	0x00000008	/* Don't do any witness checking. */
#define	MTX_NOPROFILE	0x00000020	/* Don't profile this lock */

/*
 * Option flags passed to certain lock/unlock routines, through the use
 * of corresponding mtx_{lock,unlock}_flags() interface macros.
 */
#define	MTX_QUIET	LOP_QUIET	/* Don't log a mutex event */
#define	MTX_DUPOK	LOP_DUPOK	/* Don't log a duplicate acquire */

/*
 * State bits kept in mutex->mtx_lock, for the DEFAULT lock type. None of this,
 * with the exception of MTX_UNOWNED, applies to spin locks.
 */
#define	MTX_RECURSED	0x00000001	/* lock recursed (for MTX_DEF only) */
#define	MTX_CONTESTED	0x00000002	/* lock contested (for MTX_DEF only) */
#define	MTX_UNOWNED	0x00000004	/* Cookie for free mutex */
/* All state bits; mtx_owned() masks these off before comparing the owner. */
#define	MTX_FLAGMASK	(MTX_RECURSED | MTX_CONTESTED | MTX_UNOWNED)

/*
 * Value stored in mutex->mtx_lock to denote a destroyed mutex.
 */
#define	MTX_DESTROYED	(MTX_CONTESTED | MTX_UNOWNED)

#endif	/* _KERNEL */

#ifndef LOCORE

/*
 * XXX: Friendly reminder to fix things in MP code that is presently being
 * XXX: worked on.
 *
 * The macro expands to nothing; its argument is discarded.
 */
#define mp_fixme(string)

#ifdef _KERNEL

/*
 * Prototypes
 *
 * NOTE: Functions prepended with `_' (underscore) are exported to other parts
 *	 of the kernel via macros, thus allowing us to use the cpp LOCK_FILE
 *	 and LOCK_LINE.  These functions should not be called directly by any
 *	 code using the API. Their macros cover their functionality.
*
 * [See below for descriptions]
 *
 */
void	mtx_init(struct mtx *m, const char *name, const char *type, int opts);
void	mtx_destroy(struct mtx *m);
void	mtx_sysinit(void *arg);
void	mutex_init(void);
void	_mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts,
	    const char *file, int line);
void	_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line);
#ifdef SMP
void	_mtx_lock_spin(struct mtx *m, uintptr_t tid, int opts,
	    const char *file, int line);
#endif
void	_mtx_unlock_spin(struct mtx *m, int opts, const char *file, int line);
int	_mtx_trylock(struct mtx *m, int opts, const char *file, int line);
void	_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line);
void	_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line);
void	_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file,
	     int line);
void	_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file,
	     int line);
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void	_mtx_assert(struct mtx *m, int what, const char *file, int line);
#endif
void	_thread_lock_flags(struct thread *, int, const char *, int);

/*
 * Thread lock wrappers.  The lock variants pass the caller's __FILE__ and
 * __LINE__ to _thread_lock_flags(); thread_unlock() releases the spin
 * mutex that the thread's td_lock field points to.
 */
#define	thread_lock(tdp)						\
    _thread_lock_flags((tdp), 0, __FILE__, __LINE__)
#define	thread_lock_flags(tdp, opt)					\
    _thread_lock_flags((tdp), (opt), __FILE__, __LINE__)
#define	thread_unlock(tdp)						\
       mtx_unlock_spin((tdp)->td_lock)

/*
 * We define our machine-independent (unoptimized) mutex micro-operations
 * here, if they are not already defined in the machine-dependent mutex.h
 */

/* Try to obtain mtx_lock once. */
#ifndef _obtain_lock
#define _obtain_lock(mp, tid)						\
	atomic_cmpset_acq_ptr(&(mp)->mtx_lock, MTX_UNOWNED, (tid))
#endif

/* Try to release mtx_lock if it is unrecursed and uncontested. */
#ifndef _release_lock
#define _release_lock(mp, tid)						\
	atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED)
#endif

/* Release mtx_lock quickly, assuming we own it.
*/
#ifndef _release_lock_quick
#define _release_lock_quick(mp)						\
	atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED)
#endif

/*
 * Obtain a sleep lock inline, or call the "hard" function if we can't get it
 * easy.
 *
 * A single compare-and-set from MTX_UNOWNED to `tid' is attempted; on
 * failure _mtx_lock_sleep() takes over, on success the acquisition is
 * reported to the lock profiling hook.
 */
#ifndef _get_sleep_lock
#define _get_sleep_lock(mp, tid, opts, file, line) do {			\
	uintptr_t _tid = (uintptr_t)(tid);				\
	if (!_obtain_lock((mp), _tid)) {				\
		_mtx_lock_sleep((mp), _tid, (opts), (file), (line));	\
	} else								\
		lock_profile_obtain_lock_success(&(mp)->lock_object, 0,	\
		    0, (file), (line));					\
} while (0)
#endif

/*
 * Obtain a spin lock inline, or call the "hard" function if we can't get it
 * easy. For spinlocks, we handle recursion inline (it turns out that function
 * calls can be significantly expensive on some architectures).
 * Since spin locks are not _too_ common, inlining this code is not too big
 * a deal.
 *
 * Both variants call spinlock_enter() first.  If mtx_lock already equals
 * `tid' the recursion count is bumped.  Otherwise the SMP variant falls
 * back to _mtx_lock_spin(), while the non-SMP variant asserts that the
 * lock is unowned and stores `tid' directly.
 */
#ifndef _get_spin_lock
#ifdef SMP
#define _get_spin_lock(mp, tid, opts, file, line) do {			\
	uintptr_t _tid = (uintptr_t)(tid);				\
	spinlock_enter();						\
	if (!_obtain_lock((mp), _tid)) {				\
		if ((mp)->mtx_lock == _tid)				\
			(mp)->mtx_recurse++;				\
		else {							\
			_mtx_lock_spin((mp), _tid, (opts), (file), (line)); \
		}							\
	} else								\
		lock_profile_obtain_lock_success(&(mp)->lock_object, 0,	\
		    0, (file), (line));					\
} while (0)
#else /* SMP */
#define _get_spin_lock(mp, tid, opts, file, line) do {			\
	uintptr_t _tid = (uintptr_t)(tid);				\
									\
	spinlock_enter();						\
	if ((mp)->mtx_lock == _tid)					\
		(mp)->mtx_recurse++;					\
	else {								\
		KASSERT((mp)->mtx_lock == MTX_UNOWNED, ("corrupt spinlock")); \
		(mp)->mtx_lock = _tid;					\
	}								\
} while (0)
#endif /* SMP */
#endif

/*
 * Release a sleep lock inline, or call the "hard" function if we can't do it
 * easy.
*/
#ifndef _rel_sleep_lock
#define _rel_sleep_lock(mp, tid, opts, file, line) do {			\
	uintptr_t _tid = (uintptr_t)(tid);				\
									\
	if (!_release_lock((mp), _tid))					\
		_mtx_unlock_sleep((mp), (opts), (file), (line));	\
} while (0)
#endif

/*
 * For spinlocks, we can handle everything inline, as it's pretty simple and
 * a function call would be too expensive (at least on some architectures).
 * Since spin locks are not _too_ common, inlining this code is not too big
 * a deal.
 *
 * Since we always perform a spinlock_enter() when attempting to acquire a
 * spin lock, we need to always perform a matching spinlock_exit() when
 * releasing a spin lock.  This includes the recursion cases.
 *
 * A recursed lock only has its recursion count decremented; the lock word
 * is reset to MTX_UNOWNED only on the outermost release.
 */
#ifndef _rel_spin_lock
#ifdef SMP
#define _rel_spin_lock(mp) do {						\
	if (mtx_recursed((mp)))						\
		(mp)->mtx_recurse--;					\
	else {								\
		lock_profile_release_lock(&(mp)->lock_object);		\
		_release_lock_quick((mp));				\
	}								\
	spinlock_exit();						\
} while (0)
#else /* SMP */
#define _rel_spin_lock(mp) do {						\
	if (mtx_recursed((mp)))						\
		(mp)->mtx_recurse--;					\
	else								\
		(mp)->mtx_lock = MTX_UNOWNED;				\
	spinlock_exit();						\
} while (0)
#endif /* SMP */
#endif

/*
 * Exported lock manipulation interface.
 *
 * mtx_lock(m) locks MTX_DEF mutex `m'
 *
 * mtx_lock_spin(m) locks MTX_SPIN mutex `m'
 *
 * mtx_unlock(m) unlocks MTX_DEF mutex `m'
 *
 * mtx_unlock_spin(m) unlocks MTX_SPIN mutex `m'
 *
 * mtx_lock_spin_flags(m, opts) and mtx_lock_flags(m, opts) locks mutex `m'
 *     and passes option flags `opts' to the "hard" function, if required.
 *     With these routines, it is possible to pass flags such as MTX_QUIET
 *     to the appropriate lock manipulation routines.
 *
 * mtx_trylock(m) attempts to acquire MTX_DEF mutex `m' but doesn't sleep if
 *     it cannot. Rather, it returns 0 on failure and non-zero on success.
 *     It does NOT handle recursion as we assume that if a caller is properly
 *     using this part of the interface, he will know that the lock in question
 *     is _not_ recursed.
*
 * mtx_trylock_flags(m, opts) is used the same way as mtx_trylock() but accepts
 *     relevant option flags `opts.'
 *
 * mtx_initialized(m) returns non-zero if the lock `m' has been initialized.
 *
 * mtx_owned(m) returns non-zero if the current thread owns the lock `m'
 *
 * mtx_recursed(m) returns non-zero if the lock `m' is presently recursed.
 */
/* Flag-less forms: each forwards to its *_flags() counterpart with opts 0. */
#define mtx_lock(m)		mtx_lock_flags((m), 0)
#define mtx_lock_spin(m)	mtx_lock_spin_flags((m), 0)
#define mtx_trylock(m)		mtx_trylock_flags((m), 0)
#define mtx_unlock(m)		mtx_unlock_flags((m), 0)
#define mtx_unlock_spin(m)	mtx_unlock_spin_flags((m), 0)

/* Incomplete type: this header only handles pointers to a mutex pool. */
struct mtx_pool;

/* Mutex pool functions; prototypes only, the definitions are not in this header. */
struct mtx_pool *mtx_pool_create(const char *mtx_name, int pool_size, int opts);
void mtx_pool_destroy(struct mtx_pool **poolp);
struct mtx *mtx_pool_find(struct mtx_pool *pool, void *ptr);
struct mtx *mtx_pool_alloc(struct mtx_pool *pool);

/*
 * Pool lock helpers: each one looks up a mutex with
 * mtx_pool_find(pool, ptr) and applies the matching lock or unlock macro to
 * the result.  The lookup is repeated on every use, so lock and unlock
 * depend on mtx_pool_find() yielding the same mutex for the same arguments
 * -- TODO confirm against its definition.
 */
#define mtx_pool_lock(pool, ptr) \
	mtx_lock(mtx_pool_find((pool), (ptr)))
#define mtx_pool_lock_spin(pool, ptr) \
	mtx_lock_spin(mtx_pool_find((pool), (ptr)))
#define mtx_pool_unlock(pool, ptr) \
	mtx_unlock(mtx_pool_find((pool), (ptr)))
#define mtx_pool_unlock_spin(pool, ptr) \
	mtx_unlock_spin(mtx_pool_find((pool), (ptr)))

/*
 * mtxpool_lockbuilder is a pool of sleep locks that is not witness
 * checked and should only be used for building higher level locks.
 *
 * mtxpool_sleep is a general purpose pool of sleep mutexes.
*/ extern struct mtx_pool *mtxpool_lockbuilder; extern struct mtx_pool *mtxpool_sleep; #ifndef LOCK_DEBUG #error LOCK_DEBUG not defined, include before #endif #if LOCK_DEBUG > 0 || defined(MUTEX_NOINLINE) #define mtx_lock_flags(m, opts) \ _mtx_lock_flags((m), (opts), LOCK_FILE, LOCK_LINE) #define mtx_unlock_flags(m, opts) \ _mtx_unlock_flags((m), (opts), LOCK_FILE, LOCK_LINE) #define mtx_lock_spin_flags(m, opts) \ _mtx_lock_spin_flags((m), (opts), LOCK_FILE, LOCK_LINE) #define mtx_unlock_spin_flags(m, opts) \ _mtx_unlock_spin_flags((m), (opts), LOCK_FILE, LOCK_LINE) #else /* LOCK_DEBUG == 0 && !MUTEX_NOINLINE */ #define mtx_lock_flags(m, opts) \ _get_sleep_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE) #define mtx_unlock_flags(m, opts) \ _rel_sleep_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE) #define mtx_lock_spin_flags(m, opts) \ _get_spin_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE) #define mtx_unlock_spin_flags(m, opts) \ _rel_spin_lock((m)) #endif /* LOCK_DEBUG > 0 || MUTEX_NOINLINE */ #define mtx_trylock_flags(m, opts) \ _mtx_trylock((m), (opts), LOCK_FILE, LOCK_LINE) #define mtx_sleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo)) #define mtx_initialized(m) lock_initalized(&(m)->lock_object) #define mtx_owned(m) (((m)->mtx_lock & ~MTX_FLAGMASK) == (uintptr_t)curthread) #define mtx_recursed(m) ((m)->mtx_recurse != 0) #define mtx_name(m) ((m)->lock_object.lo_name) /* * Global locks. */ extern struct mtx Giant; extern struct mtx blocked_lock; /* * Giant lock manipulation and clean exit macros. * Used to replace return with an exit Giant and return. * * Note that DROP_GIANT*() needs to be paired with PICKUP_GIANT() * The #ifndef is to allow lint-like tools to redefine DROP_GIANT. 
*/
#ifndef DROP_GIANT
/*
 * DROP_GIANT() opens a `do {' scope that is only closed by PICKUP_GIANT(),
 * so the two must appear in the same enclosing block.  While the current
 * thread owns Giant it is unlocked repeatedly, and the number of unlocks is
 * kept in the scope-local _giantcnt.
 */
#define DROP_GIANT() \
do { \
	int _giantcnt = 0; \
	WITNESS_SAVE_DECL(Giant); \
	\
	if (mtx_owned(&Giant)) { \
		WITNESS_SAVE(&Giant.lock_object, Giant); \
		for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++) \
			mtx_unlock(&Giant); \
	}

/* Re-acquire via PARTIAL_PICKUP_GIANT() and close the DROP_GIANT() scope. */
#define PICKUP_GIANT() \
	PARTIAL_PICKUP_GIANT(); \
} while (0)

/*
 * Asserts Giant is not owned, then locks it _giantcnt times and restores the
 * saved WITNESS state.  Uses the _giantcnt declared by DROP_GIANT() and does
 * not close that scope.
 */
#define PARTIAL_PICKUP_GIANT() \
	mtx_assert(&Giant, MA_NOTOWNED); \
	if (_giantcnt > 0) { \
		while (_giantcnt--) \
			mtx_lock(&Giant); \
		WITNESS_RESTORE(&Giant.lock_object, Giant); \
	}
#endif

/*
 * Evaluate `rval' into a local int, unlock Giant, then return the saved
 * value from the enclosing function.
 */
#define UGAR(rval) do { \
	int _val = (rval); \
	mtx_unlock(&Giant); \
	return (_val); \
} while (0)

/*
 * With the advent of fine-grained locking, the Giant lock is no longer
 * required around the network stack.  These macros exist for historical
 * reasons, allowing conditional acquisition of Giant based on a debugging
 * setting, and will be removed.
 *
 * All three currently expand to an empty statement.
 */
#define NET_LOCK_GIANT() do { \
} while (0)
#define NET_UNLOCK_GIANT() do { \
} while (0)
#define NET_ASSERT_GIANT() do { \
} while (0)
/*
 * NOTE(review): the leading '-' on the next line looks like a unified-diff
 * removal marker carried over from the enclosing patch text, i.e. this
 * definition is being deleted by the change -- confirm against the patch.
 */
-#define NET_CALLOUT_MPSAFE CALLOUT_MPSAFE

/*
 * Argument bundle whose address MTX_SYSINIT() passes to SYSINIT() together
 * with mtx_sysinit.  MTX_SYSINIT() initializes the fields positionally, so
 * the member order matters.
 */
struct mtx_args {
	struct mtx	*ma_mtx;	/* `mtx' argument of MTX_SYSINIT() */
	const char	*ma_desc;	/* `desc' argument of MTX_SYSINIT() */
	int		 ma_opts;	/* `opts' argument of MTX_SYSINIT() */
};

/*
 * Defines a static struct mtx_args named <name>_args, registers mtx_sysinit
 * on it through SYSINIT() and registers mtx_destroy on `mtx' through
 * SYSUNINIT(), both at SI_SUB_LOCK / SI_ORDER_MIDDLE.  The expansion has no
 * trailing semicolon; the user supplies it.
 */
#define MTX_SYSINIT(name, mtx, desc, opts) \
	static struct mtx_args name##_args = { \
		(mtx), \
		(desc), \
		(opts) \
	}; \
	SYSINIT(name##_mtx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \
	    mtx_sysinit, &name##_args); \
	SYSUNINIT(name##_mtx_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \
	    mtx_destroy, (mtx))

/*
 * The INVARIANTS-enabled mtx_assert() functionality.
 *
 * The constants need to be defined for INVARIANT_SUPPORT infrastructure
 * support as _mtx_assert() itself uses them and the latter implies that
 * _mtx_assert() must build.
*/
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
/* `what' values for mtx_assert(); each constant is a distinct bit. */
#define MA_OWNED	0x01
#define MA_NOTOWNED	0x02
#define MA_RECURSED	0x04
#define MA_NOTRECURSED	0x08
#endif

#ifdef INVARIANTS
/* Checked build: forward to _mtx_assert() with the caller's file and line. */
#define mtx_assert(m, what) \
	_mtx_assert((m), (what), __FILE__, __LINE__)

/* Assert that Giant is owned (MA_OWNED) at this point. */
#define GIANT_REQUIRED	mtx_assert(&Giant, MA_OWNED)

#else	/* INVARIANTS */
/* Without INVARIANTS both macros expand to nothing. */
#define mtx_assert(m, what)
#define GIANT_REQUIRED
#endif	/* INVARIANTS */

/*
 * Common lock type names.
 */
#define MTX_NETWORK_LOCK	"network driver"

/* The conditionals closed below are opened earlier in the header. */
#endif	/* _KERNEL */
#endif	/* !LOCORE */
#endif	/* _SYS_MUTEX_H_ */