Index: head/sys/netinet/ip_dummynet.c
===================================================================
--- head/sys/netinet/ip_dummynet.c	(revision 150349)
+++ head/sys/netinet/ip_dummynet.c	(revision 150350)
@@ -1,2144 +1,2144 @@
/*-
 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#define DUMMYNET_DEBUG

#if !defined(KLD_MODULE)
#include "opt_inet6.h"
#endif

/*
 * This module implements IP dummynet, a bandwidth limiter/delay emulator
 * used in conjunction with the ipfw package.
 * Description of the data structures used is in ip_dummynet.h
 * Here you mainly find the following blocks of code:
 *  + variable declarations;
 *  + heap management functions;
 *  + scheduler and dummynet functions;
 *  + configuration and initialization.
 *
 * NOTA BENE: critical sections are protected by the "dummynet lock".
 *
 * Most important Changes:
 *
 * 011004: KLDable
 * 010124: Fixed WF2Q behaviour
 * 010122: Fixed spl protection.
 * 000601: WF2Q support
 * 000106: large rewrite, use heaps to handle very many pipes.
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>

#include <netinet/if_ether.h> /* for struct arpcom */
#include <net/bridge.h>

#include <netinet/ip6.h>      /* for ip6_input, ip6_output prototypes */
#include <netinet6/ip6_var.h>

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
 */
static dn_key curr_time = 0 ; /* current simulation time */

static int dn_hash_size = 64 ;	/* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps ;
static int pipe_expire = 1 ;   /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;	/* RED - default medium packet size */
static int red_max_pkt_size = 1500;	/* RED - default max packet size */

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows
 *
 * extract_heap contains pipes associated with delay lines.
 *
 */

MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");

static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;

static int heap_init(struct dn_heap *h, int size) ;
static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe);
static void ready_event(struct dn_flow_queue *q);

static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */

static struct callout dn_timeout;

extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);

#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
	    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
	    CTLFLAG_RD, &curr_time, 0, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
	    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
	    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
	    CTLFLAG_RD, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
	    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
	    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
	    CTLFLAG_RW, &dn_max_ratio, 0,
	"Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
	CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
	CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
	CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
	    0, "control debugging printfs");
#endif
#define
DPRINTF(X) if (dummynet_debug) printf X #else #define DPRINTF(X) #endif static struct mtx dummynet_mtx; /* * NB: Recursion is needed to deal with re-entry via ICMP. That is, * a packet may be dispatched via ip_input from dummynet_io and * re-enter through ip_output. Yech. */ #define DUMMYNET_LOCK_INIT() \ mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF | MTX_RECURSE) #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx) #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx) #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx) #define DUMMYNET_LOCK_ASSERT() do { \ mtx_assert(&dummynet_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); static void dummynet(void *); static void dummynet_flush(void); void dummynet_drain(void); static ip_dn_io_t dummynet_io; static void dn_rule_delete(void *); int if_tx_rdy(struct ifnet *ifp); /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * XXX failure to allocate a new element is a pretty bad failure * as we basically stall a whole queue forever!! * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( 2*(x) + 1 ) #define HEAP_IS_LEFT(x) ( (x) & 1 ) #define HEAP_RIGHT(x) ( 2*(x) + 2 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_init(struct dn_heap *h, int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) { printf("dummynet: %s, Bogus call, have %d want %d\n", __func__, h->size, new_size); return 0 ; } new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT); if (p == NULL) { printf("dummynet: %s, resize %d failed\n", __func__, new_size ); return 1 ; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DUMMYNET); } h->p = p ; h->size = new_size ; return 0 ; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) * * If offset > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; /* * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. */ #define RESET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) { int son = h->elements ; if (p == NULL) /* data already there, set starting point */ son = key1 ; else { /* insert new element at the end, possibly resize */ son = h->elements ; if (son == h->size) /* need resize... */ if (heap_init(h, h->elements+1) ) return 1 ; /* failure... 
*/ h->p[son].object = p ; h->p[son].key = key1 ; h->elements++ ; } while (son > 0) { /* bubble up */ int father = HEAP_FATHER(son) ; struct dn_heap_entry tmp ; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break ; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; SET_OFFSET(h, son); son = father ; } SET_OFFSET(h, son); return 0 ; } /* * remove top element from heap, or obj if obj != NULL */ static void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1 ; if (max < 0) { printf("dummynet: warning, extract from empty heap 0x%p\n", h); return ; } father = 0 ; /* default: move up smallest child */ if (obj != NULL) { /* extract specific element, index is at offset */ if (h->offset <= 0) panic("dummynet: heap_extract from middle not supported on this heap!!!\n"); father = *((int *)((char *)obj + h->offset)) ; if (father < 0 || father >= h->elements) { printf("dummynet: heap_extract, father %d out of bound 0..%d\n", father, h->elements); panic("dummynet: heap_extract"); } } RESET_OFFSET(h, father); child = HEAP_LEFT(father) ; /* left child */ while (child <= max) { /* valid entry */ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child = child+1 ; /* take right child, otherwise left */ h->p[father] = h->p[child] ; SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ } h->elements-- ; if (father != max) { /* * Fill hole with last entry and bubble up, reusing the insert code */ h->p[father] = h->p[max] ; heap_insert(h, father, NULL); /* this one cannot fail */ } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, dn_key new_key, void *object) { int temp; int i ; int max = h->elements-1 ; struct dn_heap_entry buf ; if (h->offset <= 0) panic("cannot move items on this heap"); i = *((int *)((char *)object + h->offset)); if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ h->p[i].key = new_key ; for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; i = temp ) { /* bubble up */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } } else { /* must move down */ h->p[i].key = new_key ; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) temp++ ; /* select child with min key */ if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } else break ; i = temp ; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. */ static void heapify(struct dn_heap *h) { int i ; for (i = 0 ; i < h->elements ; i++ ) heap_insert(h, i , NULL) ; } /* * cleanup the heap and free data structure */ static void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DUMMYNET); bzero(h, sizeof(*h) ); } /* * --- end of heap management functions --- */ /* * Return the mbuf tag holding the dummynet state. As an optimization * this is assumed to be the first tag on the list. If this turns out * wrong we'll need to search the list. 
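 *
 * (Editorial worked example for the heap macros earlier in this file:
 *  with keys {1, 3, 2, 7} stored at indices 0..3, HEAP_FATHER(3) == 1
 *  and HEAP_LEFT(1) == 3, so index 3 is the left child of index 1.
 *  heap_insert() of key 0 appends it at index 4 and bubbles it up past
 *  its fathers at indices 1 and 0: two swaps restore the invariant.)
 *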
*/ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } /* * Scheduler functions: * * transmit_event() is called when the delay-line needs to enter * the scheduler, either because of existing pkts getting ready, * or new packets entering the queue. The event handled is the delivery * time of the packet. * * ready_event() does something similar with fixed-rate queues, and the * event handled is the finish time of the head pkt. * * wfq_ready_event() does something similar with WF2Q queues, and the * event handled is the start time of the head pkt. * * In all cases, we make sure that the data structures are consistent * before passing pkts out, because this might trigger recursive * invocations of the procedures. */ static void transmit_event(struct dn_pipe *pipe) { struct mbuf *m ; struct dn_pkt_tag *pkt ; struct ip *ip; DUMMYNET_LOCK_ASSERT(); while ( (m = pipe->head) ) { pkt = dn_tag_get(m); if ( !DN_KEY_LEQ(pkt->output_time, curr_time) ) break; /* * first unlink, then call procedures, since ip_input() can invoke * ip_output() and viceversa, thus causing nested calls */ pipe->head = m->m_nextpkt ; m->m_nextpkt = NULL; /* XXX: drop the lock for now to avoid LOR's */ DUMMYNET_UNLOCK(); switch (pkt->dn_dir) { case DN_TO_IP_OUT: (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL); break ; case DN_TO_IP_IN : ip = mtod(m, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip_input(m) ; break ; #ifdef INET6 case DN_TO_IP6_IN: ip6_input(m) ; break ; case DN_TO_IP6_OUT: (void)ip6_output(m, NULL, NULL, pkt->flags, NULL, NULL, NULL); break ; #endif case DN_TO_IFB_FWD: if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, pkt->ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DN_TO_BDG_FWD : /* * The bridge requires/assumes the Ethernet header is * contiguous in the first mbuf header. Insure this is true. */ if (BDG_LOADED) { if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/bridge: pullup fail, dropping pkt\n"); break; } m = bdg_forward_ptr(m, pkt->ifp); } else { /* somebody unloaded the bridge module. Drop pkt */ /* XXX rate limit */ printf("dummynet: dropping bridged packet trapped in pipe\n"); } if (m) m_freem(m); break; case DN_TO_ETH_DEMUX: /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup fail, dropping pkt\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); /* which consumes the mbuf */ break ; case DN_TO_ETH_OUT: ether_output_frame(pkt->ifp, m); break; default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(m); break ; } DUMMYNET_LOCK(); } /* if there are leftover packets, put into the heap for next event */ if ( (m = pipe->head) ) { pkt = dn_tag_get(m) ; /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ heap_insert(&extract_heap, pkt->output_time, pipe ) ; } } /* * the following macro computes how many ticks we have to wait * before being able to transmit a packet. 
The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
 */
#define SET_TICKS(_m, q, p)	\
    ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
	    p->bandwidth ;

/*
 * extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue)
 */
static void
move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
	struct dn_pipe *p, int len)
{
    struct dn_pkt_tag *dt = dn_tag_get(pkt);

    q->head = pkt->m_nextpkt ;
    q->len-- ;
    q->len_bytes -= len ;

    dt->output_time = curr_time + p->delay ;

    if (p->head == NULL)
	p->head = pkt;
    else
	p->tail->m_nextpkt = pkt;
    p->tail = pkt;
    p->tail->m_nextpkt = NULL;
}

/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invocation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    DUMMYNET_LOCK_ASSERT();

    if (p == NULL) {
	printf("dummynet: ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	if (len_scaled > q->numbytes )
	    break ;
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */

	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else { /* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
	q->numbytes = 0;
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
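/*
 * Editorial sketch, not part of the original file: the rounded-up tick
 * count that SET_TICKS computes, written out as a plain function (the
 * name is made up for the example). With hz = 1000 and bandwidth =
 * 1000000 bit/s, a 1500-byte packet against an empty credit
 * (numbytes = 0) yields (1500*8*1000 + 999999) / 1000000 = 12 ticks,
 * i.e. the 12 ms it takes to serialize the packet at 1 Mbit/s.
 */
static __inline dn_key
ticks_to_wait_example(const struct mbuf *m, const struct dn_flow_queue *q,
    const struct dn_pipe *p)
{
    /* bits still to pay for, scaled by hz so one tick earns 'bandwidth' */
    int64_t bits = (int64_t)m->m_pkthdr.len * 8 * hz - q->numbytes;

    /* round up: a partial tick still delays transmission a whole tick */
    return ((bits + p->bandwidth - 1) / p->bandwidth);
}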
/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes < 0,
 * there is an additional delay.
 */
static void
ready_event_wfq(struct dn_pipe *p)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    DUMMYNET_LOCK_ASSERT();

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name));
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct mbuf *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    p->V += (len<<MY_M) / p->sum ;	/* update V */
	    q->S = q->F ;			/* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;

	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ;	/* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * if we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	dn_tag_get(p->tail)->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
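/*
 * Editorial sketch, not in the original file: the WF2Q+ timestamp rules
 * used above and again in dummynet_io(), collected in one place. On
 * enqueue of 'len' bytes into flow q of pipe p:
 *	S = V			if the flow was idle (S > F marks this)
 *	S = max(F, V)		otherwise
 *	F = S + (len << MY_M) / weight
 * Eligible flows (S <= V) wait in scheduler_heap ordered by F; the
 * others wait in not_eligible_heap ordered by S. The function name is
 * made up for the example.
 */
static void
wf2q_stamp_example(struct dn_flow_queue *q, struct dn_pipe *p, u_int64_t len)
{
    if (DN_KEY_GT(q->S, q->F))		/* invalid stamps: flow was idle */
	q->S = p->V;
    else
	q->S = MAX64(q->F, p->V);
    q->F = q->S + (len << MY_M) / (u_int64_t)q->fs->weight;
}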
/*
 * This is called once per tick, or HZ times per second. It is used to
 * increment the current tick counter and schedule expired events.
 */
static void
dummynet(void * __unused unused)
{
    void *p ; /* generic parameter to handler */
    struct dn_heap *h ;
    struct dn_heap *heaps[3];
    int i;
    struct dn_pipe *pe ;

    heaps[0] = &ready_heap ;		/* fixed-rate queues */
    heaps[1] = &wfq_ready_heap ;	/* wfq queues */
    heaps[2] = &extract_heap ;		/* delay line */

    DUMMYNET_LOCK();
    curr_time++ ;
    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
	    if (h->p[0].key > curr_time)
		printf("dummynet: warning, heap %d is %d ticks late\n",
		    i, (int)(curr_time - h->p[0].key));
	    p = h->p[0].object ; /* store a copy before heap_extract */
	    heap_extract(h, NULL); /* need to extract before processing */
	    if (i == 0)
		ready_event(p) ;
	    else if (i == 1) {
		struct dn_pipe *pipe = p;

		if (pipe->if_name[0] != '\0')
		    printf("dummynet: bad ready_event_wfq for pipe %s\n",
			pipe->if_name);
		else
		    ready_event_wfq(p) ;
	    } else
		transmit_event(p);
	}
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (pe = all_pipes; pe ; pe = pe->next )
	if (pe->idle_heap.elements > 0 &&
		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;

	    heap_extract(&(pe->idle_heap), NULL);
	    q->S = q->F + 1 ; /* mark timestamp as invalid */
	    pe->sum -= q->fs->weight ;
	}
    DUMMYNET_UNLOCK();

    callout_reset(&dn_timeout, 1, dummynet, NULL);
}

/*
 * called by an interface when tx_rdy occurs.
 */
int
if_tx_rdy(struct ifnet *ifp)
{
    struct dn_pipe *p;

    DUMMYNET_LOCK();
    for (p = all_pipes; p ; p = p->next )
	if (p->ifp == ifp)
	    break ;
    if (p == NULL) {
	for (p = all_pipes; p ; p = p->next )
	    if (!strcmp(p->if_name, ifp->if_xname) ) {
		p->ifp = ifp ;
		DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n",
			ifp->if_xname));
		break ;
	    }
    }
    if (p != NULL) {
	DPRINTF(("dummynet: ++ tx rdy from %s - qlen %d\n", ifp->if_xname,
		ifp->if_snd.ifq_len));
	p->numbytes = 0 ; /* mark ready for I/O */
	ready_event_wfq(p);
    }
    DUMMYNET_UNLOCK();
    return 0;
}

/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;

-    if (fs->last_expired == time_second)
+    if (fs->last_expired == time_uptime)
	return 0 ;
-    fs->last_expired = time_second ;
+    fs->last_expired = time_uptime ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    if (q->head != NULL || q->S != q->F+1) {
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		free(old_q, M_DUMMYNET);
	    }
    return initial_elements - fs->rq_elements ;
}

/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q ;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
	/*
	 * No way to get room, use or create overflow queue.
	 */
	i = fs->rq_size ;
	if ( fs->rq[i] != NULL )
	    return fs->rq[i] ;
    }
    q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
    if (q == NULL) {
	printf("dummynet: sorry, cannot allocate queue for new flow\n");
	return NULL ;
    }
    q->fs = fs ;
    q->hash_slot = i ;
    q->next = fs->rq[i] ;
    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
    fs->rq[i] = q ;
    fs->rq_elements++ ;
    return q ;
}
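/*
 * Editorial note on the change above, with a minimal sketch (the helper
 * name is made up): time_second follows the wall clock and jumps when
 * the clock is stepped (e.g. by settimeofday(2)), so a guard like
 * "fs->last_expired == time_second" can fire too often, or be
 * suppressed for a long while, across a clock step. time_uptime counts
 * seconds since boot monotonically, which is exactly what a
 * once-per-second rate limit wants.
 */
static int
expire_guard_example(time_t *last)	/* hypothetical helper */
{
    if (*last == time_uptime)		/* already swept this second */
	return (0);
    *last = time_uptime;		/* monotonic across clock steps */
    return (1);
}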
/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;
    int is_v6 = IS_IP6_FLOW_ID(id);

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ;
    else {
	/* first, do the masking, then hash */
	id->dst_port &= fs->flow_mask.dst_port ;
	id->src_port &= fs->flow_mask.src_port ;
	id->proto &= fs->flow_mask.proto ;
	id->flags = 0 ; /* we don't care about this one */
	if (is_v6) {
	    APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
	    APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
	    id->flow_id6 &= fs->flow_mask.flow_id6;

	    i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^

		((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^

		((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^

		((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^

		(id->dst_port << 1) ^ (id->src_port) ^
		(id->proto ) ^
		(id->flow_id6);
	} else {
	    id->dst_ip &= fs->flow_mask.dst_ip ;
	    id->src_ip &= fs->flow_mask.src_ip ;

	    i = ( (id->dst_ip) & 0xffff ) ^
		( (id->dst_ip >> 15) & 0xffff ) ^
		( (id->src_ip << 1) & 0xffff ) ^
		( (id->src_ip >> 16 ) & 0xffff ) ^
		(id->dst_port << 1) ^ (id->src_port) ^
		(id->proto );
	}
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (is_v6 &&
		    IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
		    id->dst_port == q->id.dst_port &&
		    id->src_port == q->id.src_port &&
		    id->proto == q->id.proto &&
		    id->flags == q->id.flags &&
		    id->flow_id6 == q->id.flow_id6)
		break ; /* found */

	    if (!is_v6 && id->dst_ip == q->id.dst_ip &&
		    id->src_ip == q->id.src_ip &&
		    id->dst_port == q->id.dst_port &&
		    id->src_port == q->id.src_port &&
		    id->proto == q->id.proto &&
		    id->flags == q->id.flags)
		break ; /* found */

	    /* No match. Check if we can expire the entry */
	    if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		free(old_q, M_DUMMYNET);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = *id ;
    }
    return q ;
}

static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED algorithm
     *
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     *	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or
     * packets).
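     *
     * Editorial worked example: with w_q = 1/512, a sample q_size = 100
     * moves avg from 50 to 50 + (100 - 50)/512 ~ 50.1, so avg tracks
     * the long-term average queue and a short burst barely raises the
     * drop probability.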
* * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. * */ int64_t p_b = 0; /* queue in bytes or packets ? */ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size)); /* average queue size estimation */ if (q_size != 0) { /* * queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); q->avg += (int) v; } else { /* * queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = (curr_time - q->q_time) / fs->lookup_step; q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); /* should i drop ? */ if (q->avg < fs->min_th) { q->count = -1; return 0; /* accept packet ; */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->flags_fs & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than max_th the * packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; } else { q->count = -1; DPRINTF(("dummynet: - drop")); return 1 ; } } else if (q->avg > fs->min_th) { /* * we compute p_b using the linear dropping function p_b = c_1 * * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = * max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; } if (fs->flags_fs & DN_QSIZE_IS_BYTES) p_b = (p_b * len) / fs->max_pkt_size; if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { q->count = 0; DPRINTF(("dummynet: - red drop")); /* after a drop we calculate a new random value */ q->random = random() & 0xffff; return 1; /* drop */ } } /* end of RED algorithm */ return 0 ; /* accept */ } static __inline struct dn_flow_set * locate_flowset(int pipe_nr, struct ip_fw *rule) { struct dn_flow_set *fs; ipfw_insn *cmd = ACTION_PTR(rule); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); #ifdef __i386__ fs = ((ipfw_insn_pipe *)cmd)->pipe_ptr; #else bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs)); #endif if (fs != NULL) return fs; if (cmd->opcode == O_QUEUE) for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) ; else { struct dn_pipe *p1; for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) ; if (p1 != NULL) fs = &(p1->fs) ; } /* record for the future */ #ifdef __i386__ ((ipfw_insn_pipe *)cmd)->pipe_ptr = fs; #else bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs)); #endif return fs ; } /* * dummynet hook for packets. Below 'pipe' is a pipe or a queue * depending on whether WF2Q or fixed bw is used. * * pipe_nr pipe or queue the packet is destined for. * dir where shall we send the packet after dummynet. * m the mbuf with the packet * ifp the 'ifp' parameter from the caller. 
* NULL in ip_input, destination interface in ip_output, * real_dst in bdg_forward * rule matching rule, in case of multiple passes * flags flags from the caller, only used in ip_output * */ static int dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct dn_pkt_tag *pkt; struct m_tag *mtag; struct dn_flow_set *fs; struct dn_pipe *pipe ; u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; int is_pipe; ipfw_insn *cmd = ACTION_PTR(fwa->rule); KASSERT(m->m_nextpkt == NULL, ("dummynet_io: mbuf queue passed to dummynet")); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); is_pipe = (cmd->opcode == O_PIPE); DUMMYNET_LOCK(); /* * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. */ fs = locate_flowset(fwa->cookie, fwa->rule); if (fs == NULL) goto dropit ; /* this queue/pipe does not exist! */ pipe = fs->pipe ; if (pipe == NULL) { /* must be a queue, try find a matching pipe */ for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; pipe = pipe->next) ; if (pipe != NULL) fs->pipe = pipe ; else { printf("dummynet: no pipe %d for queue %d, drop pkt\n", fs->parent_nr, fs->fs_nr); goto dropit ; } } q = find_queue(fs, &(fwa->f_id)); if ( q == NULL ) goto dropit ; /* cannot allocate queue */ /* * update statistics, then check reasons to drop pkt */ q->tot_bytes += len ; q->tot_pkts++ ; if ( fs->plr && random() < fs->plr ) goto dropit ; /* random pkt drop */ if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { if (q->len_bytes > fs->qsize) goto dropit ; /* queue size overflow */ } else { if (q->len >= fs->qsize) goto dropit ; /* queue count overflow */ } if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) goto dropit ; /* XXX expensive to zero, see if we can remove it*/ mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO); if ( mtag == NULL ) goto dropit ; /* cannot allocate packet header */ m_tag_prepend(m, mtag); /* attach to mbuf chain */ pkt = (struct dn_pkt_tag *)(mtag+1); /* ok, i can handle the pkt now... */ /* build and enqueue packet + parameters */ pkt->rule = fwa->rule ; pkt->dn_dir = dir ; pkt->ifp = fwa->oif; if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT) pkt->flags = fwa->flags; if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->tail = m; q->len++; q->len_bytes += len ; if ( q->head != m ) /* flow was not idle, we are done */ goto done; /* * If we reach this point the flow was previously idle, so we need * to schedule it. This involves different actions for fixed-rate or * WF2Q queues. */ if (is_pipe) { /* * Fixed-rate queue: just insert into the ready_heap. */ dn_key t = 0 ; if (pipe->bandwidth) t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; if (t == 0) /* must process it now */ ready_event( q ); else heap_insert(&ready_heap, curr_time + t , q ); } else { /* * WF2Q. First, compute start time S: if the flow was idle (S=F+1) * set S to the virtual time V for the controlling pipe, and update * the sum of weights for the pipe; otherwise, remove flow from * idle_heap and set S to max(F,V). * Second, compute finish time F = S + len/weight. * Third, if pipe was idle, update V=max(S, V). * Fourth, count one more backlogged flow. 
 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V ) ;
	}
	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
	 * and for all flows in not_eligible_heap (NEH), S_i > V .
	 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
	 * we only need to look into NEH.
	 */
	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
	    if (pipe->scheduler_heap.elements == 0)
		printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
	} else {
	    heap_insert(&(pipe->scheduler_heap), q->F, q);
	    if (pipe->numbytes >= 0) { /* pipe is idle */
		if (pipe->scheduler_heap.elements != 1)
		    printf("dummynet: OUCH! pipe should have been idle!\n");
		DPRINTF(("dummynet: waking up pipe %d at %d\n",
			pipe->pipe_nr, (int)(q->F >> MY_M)));
		pipe->sched_time = curr_time ;
		ready_event_wfq(pipe);
	    }
	}
    }
done:
    DUMMYNET_UNLOCK();
    return 0;

dropit:
    if (q)
	q->drops++ ;
    DUMMYNET_UNLOCK();
    m_freem(m);
    return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
}

/*
 * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 * Doing this would probably save us the initial bzero of dn_pkt
 */
#define DN_FREE_PKT(_m) do {				\
	m_freem(_m);					\
} while (0)

/*
 * Dispose all packets and flow_queues on a flow_set.
 * If all=1, also remove red lookup table and other storage,
 * including the descriptor itself.
 * For the one in dn_pipe MUST also cleanup ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_flow_queue *q, *qn ;
    int i ;

    DUMMYNET_LOCK_ASSERT();

    for (i = 0 ; i <= fs->rq_size ; i++ ) {
	for (q = fs->rq[i] ; q ; q = qn ) {
	    struct mbuf *m, *mnext;

	    mnext = q->head;
	    while ((m = mnext) != NULL) {
		mnext = m->m_nextpkt;
		DN_FREE_PKT(m);
	    }
	    qn = q->next ;
	    free(q, M_DUMMYNET);
	}
	fs->rq[i] = NULL ;
    }
    fs->rq_elements = 0 ;
    if (all) {
	/* RED - free lookup table */
	if (fs->w_q_lookup)
	    free(fs->w_q_lookup, M_DUMMYNET);
	if (fs->rq)
	    free(fs->rq, M_DUMMYNET);
	/* if this fs is not part of a pipe, free it */
	if (fs->pipe && fs != &(fs->pipe->fs) )
	    free(fs, M_DUMMYNET);
    }
}

/*
 * Dispose all packets queued on a pipe (not a flow_set).
 * Also free all resources associated to a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct mbuf *m, *mnext;

    purge_flow_set( &(pipe->fs), 1 );

    mnext = pipe->head;
    while ((m = mnext) != NULL) {
	mnext = m->m_nextpkt;
	DN_FREE_PKT(m);
    }

    heap_free( &(pipe->scheduler_heap) );
    heap_free( &(pipe->not_eligible_heap) );
    heap_free( &(pipe->idle_heap) );
}

/*
 * Delete all pipes and heaps returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush(void)
{
    struct dn_pipe *curr_p, *p ;
    struct dn_flow_set *fs, *curr_fs;

    DUMMYNET_LOCK();
    /* remove all references to pipes ...*/
    flush_pipe_ptrs(NULL);
    /* prevent future matches...
*/ p = all_pipes ; all_pipes = NULL ; fs = all_flow_sets ; all_flow_sets = NULL ; /* and free heaps so we don't have unwanted events */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); /* * Now purge all queued pkts and delete all pipes */ /* scan and purge all flow_sets. */ for ( ; fs ; ) { curr_fs = fs ; fs = fs->next ; purge_flow_set(curr_fs, 1); } for ( ; p ; ) { purge_pipe(p); curr_p = p ; p = p->next ; free(curr_p, M_DUMMYNET); } DUMMYNET_UNLOCK(); } extern struct ip_fw *ip_fw_default_rule ; static void dn_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; struct mbuf *m ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ for (q = fs->rq[i] ; q ; q = q->next ) for (m = q->head ; m ; m = m->m_nextpkt ) { struct dn_pkt_tag *pkt = dn_tag_get(m) ; if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } } /* * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. */ void dn_rule_delete(void *r) { struct dn_pipe *p ; struct dn_flow_set *fs ; struct dn_pkt_tag *pkt ; struct mbuf *m ; DUMMYNET_LOCK(); /* * If the rule references a queue (dn_flow_set), then scan * the flow set, otherwise scan pipes. Should do either, but doing * both does not harm. */ for ( fs = all_flow_sets ; fs ; fs = fs->next ) dn_rule_delete_fs(fs, r); for ( p = all_pipes ; p ; p = p->next ) { fs = &(p->fs) ; dn_rule_delete_fs(fs, r); for (m = p->head ; m ; m = m->m_nextpkt ) { pkt = dn_tag_get(m) ; if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } } DUMMYNET_UNLOCK(); } /* * setup RED parameters */ static int config_red(struct dn_flow_set *p, struct dn_flow_set * x) { int i; x->w_q = p->w_q; x->min_th = SCALE(p->min_th); x->max_th = SCALE(p->max_th); x->max_p = p->max_p; x->c_1 = p->max_p / (p->max_th - p->min_th); x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); if (x->flags_fs & DN_IS_GENTLE_RED) { x->c_3 = (SCALE(1) - p->max_p) / p->max_th; x->c_4 = (SCALE(1) - 2 * p->max_p); } /* if the lookup table already exist, free and create it again */ if (x->w_q_lookup) { free(x->w_q_lookup, M_DUMMYNET); x->w_q_lookup = NULL ; } if (red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n"); free(x, M_DUMMYNET); return EINVAL; } x->lookup_depth = red_lookup_depth; x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); if (x->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); free(x, M_DUMMYNET); return ENOSPC; } /* fill the lookup table with (1 - w_q)^x */ x->lookup_step = p->lookup_step ; x->lookup_weight = p->lookup_weight ; x->w_q_lookup[0] = SCALE(1) - x->w_q; for (i = 1; i < x->lookup_depth; i++) x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); if (red_avg_pkt_size < 1) red_avg_pkt_size = 512 ; x->avg_pkt_size = red_avg_pkt_size ; if (red_max_pkt_size < 1) red_max_pkt_size = 1500 ; x->max_pkt_size = red_max_pkt_size ; return 0 ; } static int alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) { if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ int l = pfs->rq_size; if (l == 0) l = dn_hash_size; if (l < 4) l = 4; else if (l > DN_MAX_HASH_SIZE) l = DN_MAX_HASH_SIZE; x->rq_size = l; } else /* one is enough for null mask */ x->rq_size = 1; x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), M_DUMMYNET, M_NOWAIT | M_ZERO); if (x->rq == NULL) { printf("dummynet: sorry, cannot allocate queue\n"); return ENOSPC; } x->rq_elements = 0; return 0 
; } static void set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) { x->flags_fs = src->flags_fs; x->qsize = src->qsize; x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { if (x->qsize > 1024*1024) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) x->qsize = 50 ; if (x->qsize > 100) x->qsize = 50 ; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) config_red(src, x) ; /* XXX should check errors */ } /* * setup pipe or queue parameters. */ static int config_pipe(struct dn_pipe *p) { int i, r; struct dn_flow_set *pfs = &(p->fs); struct dn_flow_queue *q; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes */ p->delay = ( p->delay * hz ) / 1000 ; /* We need either a pipe number or a flow_set number */ if (p->pipe_nr == 0 && pfs->fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && pfs->fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is a pipe */ struct dn_pipe *x, *a, *b; DUMMYNET_LOCK(); /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_NOWAIT | M_ZERO); if (x == NULL) { DUMMYNET_UNLOCK(); printf("dummynet: no memory for new pipe\n"); return ENOSPC; } x->pipe_nr = p->pipe_nr; x->fs.pipe = x ; /* idle_heap is the only one from which we extract from the middle. */ x->idle_heap.size = x->idle_heap.elements = 0 ; x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); } else { x = b; /* Flush accumulated credit for all queues */ for (i = 0; i <= x->fs.rq_size; i++) for (q = x->fs.rq[i]; q; q = q->next) q->numbytes = 0; } x->bandwidth = p->bandwidth ; x->numbytes = 0; /* just in case... */ bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); x->ifp = NULL ; /* reset interface ptr */ x->delay = p->delay ; set_fs_parms(&(x->fs), pfs); if ( x->fs.rq == NULL ) { /* a new pipe */ r = alloc_hash(&(x->fs), pfs) ; if (r) { DUMMYNET_UNLOCK(); free(x, M_DUMMYNET); return r ; } x->next = b ; if (a == NULL) all_pipes = x ; else a->next = x ; } DUMMYNET_UNLOCK(); } else { /* config queue */ struct dn_flow_set *x, *a, *b ; DUMMYNET_LOCK(); /* locate flow_set */ for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; a = b , b = b->next) ; if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ if (pfs->parent_nr == 0) { /* need link to a pipe */ DUMMYNET_UNLOCK(); return EINVAL ; } x = malloc(sizeof(struct dn_flow_set), M_DUMMYNET, M_NOWAIT|M_ZERO); if (x == NULL) { DUMMYNET_UNLOCK(); printf("dummynet: no memory for new flow_set\n"); return ENOSPC; } x->fs_nr = pfs->fs_nr; x->parent_nr = pfs->parent_nr; x->weight = pfs->weight ; if (x->weight == 0) x->weight = 1 ; else if (x->weight > 100) x->weight = 100 ; } else { /* Change parent pipe not allowed; must delete and recreate */ if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) { DUMMYNET_UNLOCK(); return EINVAL ; } x = b; } set_fs_parms(x, pfs); if ( x->rq == NULL ) { /* a new flow_set */ r = alloc_hash(x, pfs) ; if (r) { DUMMYNET_UNLOCK(); free(x, M_DUMMYNET); return r ; } x->next = b; if (a == NULL) all_flow_sets = x; else a->next = x; } DUMMYNET_UNLOCK(); } return 0 ; } /* * Helper function to remove from a heap queues which are linked to * a flow_set about to be deleted. 
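 * (Editorial note: removal below overwrites each matching slot with the
 *  last element and shrinks the array, then runs a single heapify() at
 *  the end, instead of paying a full heap_extract() for every match.)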
*/ static void fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) { int i = 0, found = 0 ; for (; i < h->elements ;) if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { h->elements-- ; h->p[i] = h->p[h->elements] ; found++ ; } else i++ ; if (found) heapify(h); } /* * helper function to remove a pipe from a heap (can be there at most once) */ static void pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) { if (h->elements > 0) { int i = 0 ; for (i=0; i < h->elements ; i++ ) { if (h->p[i].object == p) { /* found it */ h->elements-- ; h->p[i] = h->p[h->elements] ; heapify(h); break ; } } } } /* * drain all queues. Called in case of severe mbuf shortage. */ void dummynet_drain() { struct dn_flow_set *fs; struct dn_pipe *p; struct mbuf *m, *mnext; DUMMYNET_LOCK_ASSERT(); heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) purge_flow_set(fs, 0); for (p = all_pipes; p; p= p->next ) { purge_flow_set(&(p->fs), 0); mnext = p->head; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; DN_FREE_PKT(m); } p->head = p->tail = NULL ; } } /* * Fully delete a pipe or a queue, cleaning up associated info. */ static int delete_pipe(struct dn_pipe *p) { if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is an old-style pipe */ struct dn_pipe *a, *b; struct dn_flow_set *fs; DUMMYNET_LOCK(); /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || (b->pipe_nr != p->pipe_nr) ) { DUMMYNET_UNLOCK(); return EINVAL ; /* not found */ } /* unlink from list of pipes */ if (a == NULL) all_pipes = b->next ; else a->next = b->next ; /* remove references to this pipe from the ip_fw rules. */ flush_pipe_ptrs(&(b->fs)); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) if (fs->pipe == b) { printf("dummynet: ++ ref to pipe %d from fs %d\n", p->pipe_nr, fs->fs_nr); fs->pipe = NULL ; purge_flow_set(fs, 0); } fs_remove_from_heap(&ready_heap, &(b->fs)); purge_pipe(b); /* remove all data associated to this pipe */ /* remove reference to here from extract_heap and wfq_ready_heap */ pipe_remove_from_heap(&extract_heap, b); pipe_remove_from_heap(&wfq_ready_heap, b); DUMMYNET_UNLOCK(); free(b, M_DUMMYNET); } else { /* this is a WF2Q queue (dn_flow_set) */ struct dn_flow_set *a, *b; DUMMYNET_LOCK(); /* locate set */ for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; a = b , b = b->next) ; if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) { DUMMYNET_UNLOCK(); return EINVAL ; /* not found */ } if (a == NULL) all_flow_sets = b->next ; else a->next = b->next ; /* remove references to this flow_set from the ip_fw rules. */ flush_pipe_ptrs(b); if (b->pipe != NULL) { /* Update total weight on parent pipe and cleanup parent heaps */ b->pipe->sum -= b->weight * b->backlogged ; fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); fs_remove_from_heap(&(b->pipe->scheduler_heap), b); #if 1 /* XXX should i remove from idle_heap as well ? 
*/ fs_remove_from_heap(&(b->pipe->idle_heap), b); #endif } purge_flow_set(b, 1); DUMMYNET_UNLOCK(); } return 0 ; } /* * helper function used to copy data from kernel in DUMMYNET_GET */ static char * dn_copy_set(struct dn_flow_set *set, char *bp) { int i, copied = 0 ; struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; DUMMYNET_LOCK_ASSERT(); for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) printf("dummynet: ++ at %d: wrong slot (have %d, " "should be %d)\n", copied, q->hash_slot, i); if (q->fs != set) printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n", i, q->fs, set); copied++ ; bcopy(q, qp, sizeof( *q ) ); /* cleanup pointers */ qp->next = NULL ; qp->head = qp->tail = NULL ; qp->fs = NULL ; } if (copied != set->rq_elements) printf("dummynet: ++ wrong count, have %d should be %d\n", copied, set->rq_elements); return (char *)qp ; } static size_t dn_calc_size(void) { struct dn_flow_set *set ; struct dn_pipe *p ; size_t size ; DUMMYNET_LOCK_ASSERT(); /* * compute size of data structures: list of pipes and flow_sets. */ for (p = all_pipes, size = 0 ; p ; p = p->next ) size += sizeof( *p ) + p->fs.rq_elements * sizeof(struct dn_flow_queue); for (set = all_flow_sets ; set ; set = set->next ) size += sizeof ( *set ) + set->rq_elements * sizeof(struct dn_flow_queue); return size ; } static int dummynet_get(struct sockopt *sopt) { char *buf, *bp ; /* bp is the "copy-pointer" */ size_t size ; struct dn_flow_set *set ; struct dn_pipe *p ; int error=0, i ; /* XXX lock held too long */ DUMMYNET_LOCK(); /* * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we * cannot use this flag while holding a mutex. */ for (i = 0; i < 10; i++) { size = dn_calc_size(); DUMMYNET_UNLOCK(); buf = malloc(size, M_TEMP, M_WAITOK); DUMMYNET_LOCK(); if (size == dn_calc_size()) break; free(buf, M_TEMP); buf = NULL; } if (buf == NULL) { DUMMYNET_UNLOCK(); return ENOBUFS ; } for (p = all_pipes, bp = buf ; p ; p = p->next ) { struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; /* * copy pipe descriptor into *bp, convert delay back to ms, * then copy the flow_set descriptor(s) one at a time. * After each flow_set, copy the queue descriptor it owns. */ bcopy(p, bp, sizeof( *p ) ); pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; /* * XXX the following is a hack based on ->next being the * first field in dn_pipe and dn_flow_set. The correct * solution would be to move the dn_flow_set to the beginning * of struct dn_pipe. */ pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; /* clean pointers */ pipe_bp->head = pipe_bp->tail = NULL ; pipe_bp->fs.next = NULL ; pipe_bp->fs.pipe = NULL ; pipe_bp->fs.rq = NULL ; bp += sizeof( *p ) ; bp = dn_copy_set( &(p->fs), bp ); } for (set = all_flow_sets ; set ; set = set->next ) { struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; bcopy(set, bp, sizeof( *set ) ); /* XXX same hack as above */ fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; fs_bp->pipe = NULL ; fs_bp->rq = NULL ; bp += sizeof( *set ) ; bp = dn_copy_set( set, bp ); } DUMMYNET_UNLOCK(); error = sooptcopyout(sopt, buf, size); free(buf, M_TEMP); return error ; } /* * Handler for the various dummynet socket options (get, flush, config, del) */ static int ip_dn_ctl(struct sockopt *sopt) { int error = 0 ; struct dn_pipe *p, tmp_pipe; /* Disallow sets in really-really secure mode. 
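 * (Editorial note: securelevel_ge() returns EPERM once the credential's
 *  securelevel reaches 3, so at that level only the read-only
 *  IP_DUMMYNET_GET request below is still served.)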
 */
    if (sopt->sopt_dir == SOPT_SET) {
#if __FreeBSD_version >= 500034
	error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
	if (error)
	    return (error);
#else
	if (securelevel >= 3)
	    return (EPERM);
#endif
    }

    switch (sopt->sopt_name) {
    default :
	printf("dummynet: -- unknown option %d", sopt->sopt_name);
	return EINVAL ;

    case IP_DUMMYNET_GET :
	error = dummynet_get(sopt);
	break ;

    case IP_DUMMYNET_FLUSH :
	dummynet_flush() ;
	break ;

    case IP_DUMMYNET_CONFIGURE :
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
	if (error)
	    break ;
	error = config_pipe(p);
	break ;

    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
	if (error)
	    break ;
	error = delete_pipe(p);
	break ;
    }
    return error ;
}

static void
ip_dn_init(void)
{
    if (bootverbose)
	printf("DUMMYNET with IPv6 initialized (040826)\n");

    DUMMYNET_LOCK_INIT();

    all_pipes = NULL ;
    all_flow_sets = NULL ;
    ready_heap.size = ready_heap.elements = 0 ;
    ready_heap.offset = 0 ;

    wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
    wfq_ready_heap.offset = 0 ;

    extract_heap.size = extract_heap.elements = 0 ;
    extract_heap.offset = 0 ;

    ip_dn_ctl_ptr = ip_dn_ctl;
    ip_dn_io_ptr = dummynet_io;
    ip_dn_ruledel_ptr = dn_rule_delete;

    callout_init(&dn_timeout, NET_CALLOUT_MPSAFE);
    callout_reset(&dn_timeout, 1, dummynet, NULL);
}

#ifdef KLD_MODULE
static void
ip_dn_destroy(void)
{
    ip_dn_ctl_ptr = NULL;
    ip_dn_io_ptr = NULL;
    ip_dn_ruledel_ptr = NULL;

    callout_stop(&dn_timeout);
    dummynet_flush();

    DUMMYNET_LOCK_DESTROY();
}
#endif /* KLD_MODULE */

static int
dummynet_modevent(module_t mod, int type, void *data)
{
    switch (type) {
    case MOD_LOAD:
	if (DUMMYNET_LOADED) {
	    printf("DUMMYNET already loaded\n");
	    return EEXIST ;
	}
	ip_dn_init();
	break;

    case MOD_UNLOAD:
#if !defined(KLD_MODULE)
	printf("dummynet statically compiled, cannot unload\n");
	return EINVAL ;
#else
	ip_dn_destroy();
#endif
	break ;
    default:
	return EOPNOTSUPP;
	break ;
    }
    return 0 ;
}

static moduledata_t dummynet_mod = {
	"dummynet",
	dummynet_modevent,
	NULL
};
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 1);

Index: head/sys/netinet/ip_fw2.c
===================================================================
--- head/sys/netinet/ip_fw2.c	(revision 150349)
+++ head/sys/netinet/ip_fw2.c	(revision 150350)
@@ -1,4273 +1,4273 @@
/*-
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ip6fw.h" #include "opt_ipdn.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif #include #include #ifdef INET6 #include #endif #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. */ static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 /* * Data structure to cache our ucred related * information. This structure only gets used if * the user specified UID/GID based constraints in * a firewall rule. 
*/ struct ip_fw_ugid { gid_t fw_groups[NGROUPS]; int fw_ngroups; uid_t fw_uid; int fw_prid; }; struct ip_fw_chain { struct ip_fw *rules; /* list of rules */ struct ip_fw *reap; /* list of rules to reap */ struct mtx mtx; /* lock guarding rule list */ int busy_count; /* busy count for rw locks */ int want_write; struct cv cv; }; #define IPFW_LOCK_INIT(_chain) \ mtx_init(&(_chain)->mtx, "IPFW static rules", NULL, \ MTX_DEF | MTX_RECURSE) #define IPFW_LOCK_DESTROY(_chain) mtx_destroy(&(_chain)->mtx) #define IPFW_WLOCK_ASSERT(_chain) do { \ mtx_assert(&(_chain)->mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) static __inline void IPFW_RLOCK(struct ip_fw_chain *chain) { mtx_lock(&chain->mtx); chain->busy_count++; mtx_unlock(&chain->mtx); } static __inline void IPFW_RUNLOCK(struct ip_fw_chain *chain) { mtx_lock(&chain->mtx); chain->busy_count--; if (chain->busy_count == 0 && chain->want_write) cv_signal(&chain->cv); mtx_unlock(&chain->mtx); } static __inline void IPFW_WLOCK(struct ip_fw_chain *chain) { mtx_lock(&chain->mtx); chain->want_write++; while (chain->busy_count > 0) cv_wait(&chain->cv, &chain->mtx); } static __inline void IPFW_WUNLOCK(struct ip_fw_chain *chain) { chain->want_write--; cv_signal(&chain->cv); mtx_unlock(&chain->mtx); } /* * list of rules for layer 3 */ static struct ip_fw_chain layer3_chain; MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; #define IPFW_TABLES_MAX 128 static struct ip_fw_table { struct radix_node_head *rnh; int modified; in_addr_t last_addr; int last_match; u_int32_t last_value; } ipfw_tables[IPFW_TABLES_MAX]; static int fw_debug = 1; static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0, "Enable ipfw"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_verbose, 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. 
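* As a minimal sketch (not the literal code), the per-packet lookup is: * i = hash_packet(&args->f_id); (the result is already masked to curr_dyn_buckets - 1) * for (q = ipfw_dyn_v[i]; q != NULL; q = q->next) * if (args->f_id matches q->id, straight or swapped) break;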
* The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create any more. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) /* * Timeouts for various events in handling dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static u_int32_t dyn_keepalive_interval = 20; static u_int32_t dyn_keepalive_period = 5; static u_int32_t dyn_keepalive = 1; /* do send keepalives */ static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &dyn_buckets, 0, "Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &dyn_count, 0, "Number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &dyn_max, 0, "Max number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &static_count, 0, "Number of static rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &dyn_short_lifetime, 0, "Lifetime of dyn.
rules for other situations"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); #ifdef INET6 /* * IPv6 specific variables */ SYSCTL_DECL(_net_inet6_ip6); static struct sysctl_ctx_list ip6_fw_sysctl_ctx; static struct sysctl_oid *ip6_fw_sysctl_tree; #endif /* INET6 */ #endif /* SYSCTL_NODE */ static int fw_deny_unknown_exthdrs = 1; /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) ); } #define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \ (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) ) static int is_icmp_query(struct icmphdr *icmp) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<<type)) ); } #undef TT /* * The following checks use two arrays of 8 or 16 bits to store the * bits that we want set or clear, respectively. They are in the * low and high half of cmd->arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { struct ifaddr *ia; /* XXX lock? */ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ...
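* (Note that iface_match() above accepts an exact if_xname match, an fnmatch(3) glob when cmd->p.glob is set, or, when no name is given, any interface that owns the supplied IPv4 address.)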
*/ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. The 'versrcreach' * option just checks that the source address is reachable via any route * (except default) in the routing table. These two are a measure to block * forged packets. This is also commonly known as "anti-spoofing" or Unicast * Reverse Path Forwarding (Unicast RPF) in Cisco-ese. The names of the knobs * are purposely reminiscent of the Cisco IOS commands, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implement the same functionality. But note that the syntax is * misleading. The check may be performed on all IP packets whether unicast, * multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp) { struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* if ifp is provided, check for equality with rtentry */ if (ifp != NULL && ro.ro_rt->rt_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } #ifdef INET6 /* * ipv6 specific rules here...
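* They mirror the IPv4 helpers above; e.g. icmp6type_match() below consults a 256-bit map via cmd->d[type/32] & (1 << (type % 32)), where a single 32-bit word is enough for the ICMPv4 types.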
*/ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &ifnet, if_link) for (mdc2 = mdc->if_addrlist.tqh_first; mdc2; mdc2 = mdc2->ifa_list.tqe_next) { if (!mdc2->ifa_addr) continue; if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) return 1; } } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; rtalloc_ign((struct route *)&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* if ifp is provided, check for equality with rtentry */ if (ifp != NULL && ro.ro_rt->rt_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[0]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[1]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port) ^ (id->flow_id6); return i; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_short offset, u_int hlen) { if (code == ICMP6_UNREACH_RST && offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct ip6_hdr *ip6; struct tcphdr *tcp; tcp_seq ack, seq; int flags; struct { struct ip6_hdr ip6; struct tcphdr th; } ti; if (args->m->m_len < (hlen+sizeof(struct tcphdr))) { args->m = m_pullup(args->m, hlen+sizeof(struct tcphdr)); if (args->m == NULL) return; } ip6 = mtod(args->m, struct ip6_hdr *); tcp = (struct tcphdr *)(mtod(args->m, char *) + hlen); if ((tcp->th_flags & TH_RST) != 0) { m_freem(args->m); return; } ti.ip6 = *ip6; ti.th = *tcp; ti.th.th_seq = ntohl(ti.th.th_seq); ti.th.th_ack = ntohl(ti.th.th_ack); ti.ip6.ip6_nxt = IPPROTO_TCP; if (ti.th.th_flags & TH_ACK) { ack = 0; seq = ti.th.th_ack; flags = TH_RST; } else { ack = ti.th.th_seq; if (((args->m)->m_flags & M_PKTHDR) != 0) { ack += (args->m)->m_pkthdr.len - hlen - (ti.th.th_off << 2); } else if (ip6->ip6_plen) { ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) - hlen - (ti.th.th_off << 2); } else { m_freem(args->m); return; } if (tcp->th_flags & TH_SYN) ack++; seq = 0; flags = TH_RST|TH_ACK; } bcopy(&ti, ip6, sizeof(ti)); tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), 
args->m, ack, seq, flags); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ icmp6_error(args->m, ICMP6_DST_UNREACH, code, 0); } else m_freem(args->m); args->m = NULL; } #endif /* INET6 */ static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset) { struct ether_header *eh = args->eh; char *action; int limit_reached = 0; char action2[40], proto[128], fragment[32]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(sa->sa.sin_addr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; char src[48], dst[48]; struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; /* Initialize to make compiler happy. 
*/ struct ip *ip = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 if (args->f_id.addr_type == 6) { snprintf(src, sizeof(src), "[%s]", ip6_sprintf(&args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(&args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)mtod(m, struct ip6_hdr *); tcp = (struct tcphdr *)(mtod(args->m, char *) + hlen); udp = (struct udphdr *)(mtod(args->m, char *) + hlen); } else #endif { ip = mtod(m, struct ip *); tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntoa_r(ip->ip_src, src); inet_ntoa_r(ip->ip_dst, dst); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(mtod(args->m, char *) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (args->f_id.addr_type == 6) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.frag_id6, ntohs(ip6->ip6_plen) - hlen, ntohs(offset & IP6F_OFF_MASK) << 3, (offset & IP6F_MORE_FRAG) ? "+" : ""); } else #endif { int ip_off, ip_len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) ? "+" : ""); } } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. 
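* In outline this is the usual singly-linked list splice (a sketch, not the exact macro text: if prev != NULL then prev->next = q->next, else head = q->next), plus the O_LIMIT refcount bookkeeping and the uma_zfree().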
prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ - if (!FORCE && last_remove == time_second) + if (!FORCE && last_remove == time_uptime) return; - last_remove = time_second; + last_remove = time_uptime; /* * because O_LIMIT rules refer to parent rules, during the first pass we * only remove children and mark any pending LIMIT_PARENT rules, which * are removed in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && - !TIME_LEQ( q->expire, time_second )) + !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions.
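* Each entry is compared in both orientations, so one dynamic rule covers both halves of a session: a hit on (src, dst) reports MATCH_FORWARD, a hit on the swapped pair reports MATCH_REVERSE.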
* Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; - if (TIME_LEQ( q->expire, time_second)) { /* expire entry */ + if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && q->dyn_type != O_LIMIT_PARENT) { if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6)) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.dst_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.src_ip6)) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ - q->expire = time_second + dyn_syn_lifetime; + q->expire = time_uptime + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } - q->expire = time_second + dyn_ack_lifetime; + q->expire = time_uptime + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; - q->expire = time_second + dyn_fin_lifetime; + q->expire = time_uptime + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. 
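* (For example, a keep-state rule that only ever sees one direction of a connection never reaches BOTH_SYN, so its entry stays on one of the short lifetimes and is reaped quickly.)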
*/ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; - q->expire = time_second + dyn_rst_lifetime; + q->expire = time_uptime + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { - q->expire = time_second + dyn_udp_lifetime; + q->expire = time_uptime + dyn_udp_lifetime; } else { /* other protocols */ - q->expire = time_second + dyn_short_lifetime; + q->expire = time_uptime + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; - r->expire = time_second + dyn_syn_lifetime; + r->expire = time_uptime + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. 
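* The parent entry effectively acts as a per-mask session counter: its count field tracks the live O_LIMIT children and is compared against cmd->conn_limit in install_state() below.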
*/ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { - q->expire = time_second + dyn_short_lifetime; + q->expire = time_uptime + dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ static int install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args) { static int last_log; ipfw_dyn_rule *q; DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n", cmd->o.opcode, (args->f_id.src_ip), (args->f_id.src_port), (args->f_id.dst_ip), (args->f_id.dst_port) );) IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ - if (last_log != time_second) { - last_log = time_second; + if (last_log != time_uptime) { + last_log = time_uptime; printf("ipfw: install_state: entry already present, done\n"); } IPFW_DYN_UNLOCK(); return 0; } if (dyn_count >= dyn_max) /* * Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (dyn_count >= dyn_max) { - if (last_log != time_second) { - last_log = time_second; + if (last_log != time_uptime) { + last_log = time_uptime; printf("ipfw: install_state: Too many dynamic rules\n"); } IPFW_DYN_UNLOCK(); return 1; /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: /* limit number of sessions */ { u_int16_t limit_mask = cmd->limit_mask; struct ipfw_flow_id id; ipfw_dyn_rule *parent; DEB(printf("ipfw: installing dyn-limit rule %d\n", cmd->conn_limit);) id.dst_ip = id.src_ip = 0; id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; parent = lookup_dyn_parent(&id, rule); if (parent == NULL) { printf("ipfw: add parent failed\n"); IPFW_DYN_UNLOCK(); return 1; } if (parent->count >= cmd->conn_limit) { /* * See if we can remove some expired rule. 
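* (A hypothetical example: with a rule using 'limit src-addr 4', the flow id is masked down to the source address, so the parent counts sessions per source and a fifth concurrent one is refused unless an expired entry can be reclaimed here.)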
*/ remove_dyn_rule(rule, parent); if (parent->count >= cmd->conn_limit) { - if (fw_verbose && last_log != time_second) { - last_log = time_second; + if (fw_verbose && last_log != time_uptime) { + last_log = time_uptime; log(LOG_SECURITY | LOG_DEBUG, "drop session, too many entries\n"); } IPFW_DYN_UNLOCK(); return 1; } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); } break; default: printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode); IPFW_DYN_UNLOCK(); return 1; } lookup_dyn_rule_locked(&args->f_id, NULL, NULL); /* XXX just set lifetime */ IPFW_DYN_UNLOCK(); return 0; } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ */ static struct mbuf * send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == 0) return (NULL); m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ? */ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); } /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, u_short offset, int ip_len) { if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). 
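* (On the layer-3 path ip_input() has already converted ip_len and ip_off to host order; only layer-2 packets, i.e. args->eh != NULL, still carry them in network order, hence the ntohs() below.)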
*/ if (args->eh != NULL) { struct ip *ip = mtod(args->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = send_pkt(&(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. */ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); if (cmd->opcode == O_ALTQ) cmd += F_LEN(cmd); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } static void init_tables(void) { int i; for (i = 0; i < IPFW_TABLES_MAX; i++) { rn_inithead((void **)&ipfw_tables[i].rnh, 32); ipfw_tables[i].modified = 1; } } static int add_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen, u_int32_t value) { struct radix_node_head *rnh; struct table_entry *ent; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ipfw_tables[tbl].rnh; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); if (ent == NULL) return (ENOMEM); ent->value = value; ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; RADIX_NODE_HEAD_LOCK(rnh); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { RADIX_NODE_HEAD_UNLOCK(rnh); free(ent, M_IPFW_TBL); return (EEXIST); } ipfw_tables[tbl].modified = 1; RADIX_NODE_HEAD_UNLOCK(rnh); return (0); } static int del_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa, mask; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ipfw_tables[tbl].rnh; sa.sin_len = mask.sin_len = 8; mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; RADIX_NODE_HEAD_LOCK(rnh); ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); if (ent == NULL) { RADIX_NODE_HEAD_UNLOCK(rnh); return (ESRCH); } ipfw_tables[tbl].modified = 1; RADIX_NODE_HEAD_UNLOCK(rnh); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static int flush_table(u_int16_t tbl) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ipfw_tables[tbl].rnh; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, flush_table_entry, rnh); ipfw_tables[tbl].modified = 1; RADIX_NODE_HEAD_UNLOCK(rnh); return (0); } static void flush_tables(void) { u_int16_t tbl; for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) flush_table(tbl); } static int lookup_table(u_int16_t tbl, in_addr_t addr, u_int32_t *val) { struct radix_node_head *rnh; struct ip_fw_table *table; struct table_entry *ent; struct sockaddr_in sa; int last_match; if (tbl >= IPFW_TABLES_MAX) return (0); table = &ipfw_tables[tbl]; rnh = table->rnh; RADIX_NODE_HEAD_LOCK(rnh); if (addr == table->last_addr && !table->modified) { last_match = table->last_match; if (last_match) *val = table->last_value; RADIX_NODE_HEAD_UNLOCK(rnh); return (last_match); } table->modified = 0; sa.sin_len = 8; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); table->last_addr = addr; if (ent != NULL) { table->last_value = *val = ent->value; table->last_match = 1; RADIX_NODE_HEAD_UNLOCK(rnh); return (1); } table->last_match = 0; RADIX_NODE_HEAD_UNLOCK(rnh); return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } static int count_table(u_int32_t tbl, u_int32_t *cnt) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ipfw_tables[tbl].rnh; *cnt = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, count_table_entry, cnt); RADIX_NODE_HEAD_UNLOCK(rnh); return (0); } static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } static int dump_table(ipfw_table *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ipfw_tables[tbl->tbl].rnh; tbl->cnt = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, dump_table_entry, tbl); RADIX_NODE_HEAD_UNLOCK(rnh); return (0); } static void fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp) { struct ucred *cr; if (inp->inp_socket != NULL) { cr = inp->inp_socket->so_cred; ugp->fw_prid = jailed(cr) ? 
cr->cr_prison->pr_id : -1; ugp->fw_uid = cr->cr_uid; ugp->fw_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups)); } } static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; int match; gid_t *gp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather than holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *lookup == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { fill_ugid_cache(inp, ugp); *lookup = 1; } } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that there is no match. */ if (*lookup == -1) return (0); if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = 1; pi = &udbinfo; } else return 0; match = 0; if (*lookup == 0) { INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb != NULL) { INP_LOCK(pcb); if (pcb->inp_socket != NULL) { fill_ugid_cache(pcb, ugp); *lookup = 1; } INP_UNLOCK(pcb); } INP_INFO_RUNLOCK(pi); if (*lookup == 0) { /* * If the lookup did not yield any results, there * is no sense in coming back and trying again. So * we can set lookup to -1 and ensure that we won't * bother the pcb system again. */ *lookup = -1; return (0); } } if (insn->o.opcode == O_UID) match = (ugp->fw_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) { for (gp = ugp->fw_groups; gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++) if (*gp == (gid_t)insn->d[0]) { match = 1; break; } } else if (insn->o.opcode == O_JAIL) match = (ugp->fw_prid == (int)insn->d[0]); return match; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->cookie a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables hold state during the processing of a packet. * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumptions about the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet.
* * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is simply an alias of the value of m, and it is kept * in sync with it (the packet is supposed to start with * the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ struct ip_fw_ugid fw_ugid_cache; int ugid_lookup = 0; /* * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG * associated with a packet input on a divert socket. This * allows us to distinguish traffic and its direction when * it originates from a divert socket. */ u_int divinput_flags = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, bdg_forward, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset == 0 means there is no Fragment Header. * If offset != 0 for IPv6 always use correct mask to * get the correct offset because we add IP6F_MORE_FRAG * to be able to detect the first fragment which would * otherwise have offset = 0. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int pktlen; /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &layer3_chain; struct m_tag *mtag; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; u_int16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; if (m->m_flags & M_SKIP_FIREWALL) return (IP_FW_PASS); /* accept */ pktlen = m->m_pkthdr.len; proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf.
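* A minimal usage sketch, assuming an IPv4 packet whose header length is hlen: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; this is exactly the pattern of the protocol switches below.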
WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(len, p, T) \ do { \ int x = (len) + sizeof(T); \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (len)); \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || ntohs(args->eh->ether_type)==ETHERTYPE_IPV6) && mtod(m, struct ip *)->ip_v == 6) { is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = mtod(m, struct ip6_hdr *)->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); args->f_id.flags = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); if (((struct ip6_rthdr *)ulp)->ip6r_type != 0) { printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; /* Add IP6F_MORE_FRAG for offset of first * fragment to be != 0. */ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.frag_id6 = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_ext); /* Packet ends here. if ip6e_len!=0 octets * must be ignored. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? 
*/ PULLUP_TO(hlen, ulp, struct ip6_ext); break; default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } /*switch */ } args->f_id.src_ip6 = mtod(m,struct ip6_hdr *)->ip6_src; args->f_id.dst_ip6 = mtod(m,struct ip6_hdr *)->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(mtod(m, struct ip6_hdr *)->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || ntohs(args->eh->ether_type) == ETHERTYPE_IP) && mtod(m, struct ip *)->ip_v == 4) { is_ipv4 = 1; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } pktlen = ip_len < pktlen ? ip_len : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = mtag ? divert_cookie(mtag) : 0; f = chain->rules; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) { IPFW_RUNLOCK(chain); return (IP_FW_DENY); /* invalid */ } while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) { /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } } } /* reset divert rule to avoid confusion later */ if (mtag) { divinput_flags = divert_info(mtag) & (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG); m_tag_delete(m, mtag); } /* * Now scan the rules, and parse microinstructions for each rule. */ for (; f; f = f->next) { int l, cmdlen; ipfw_insn *cmd; int skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. 
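* For example, an or-block such as { src-ip 10.0.0.1 or src-ip 10.0.0.2 } compiles to two O_IP_SRC instructions with F_OR set on the first; once 10.0.0.1 matches, skip_or suppresses the remaining test.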
*/ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset!=0) break; if (is_ipv6) /* XXX to be fixed later */ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &fw_ugid_cache, &ugid_lookup, args->inp); break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t t = ntohs(args->eh->ether_type); u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (t>=p[0] && t<=p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: match = (cmd->arg1 & 1 && divinput_flags & IP_FW_DIVERT_LOOPBACK_FLAG) || (cmd->arg1 & 2 && divinput_flags & IP_FW_DIVERT_OUTPUT_FLAG); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v; match = lookup_table(cmd->arg1, a, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? 
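/*
 * Illustrative sketch, not part of this change: O_MACADDR2 above
 * compares the 12 bytes of destination plus source MAC as three 32-bit
 * words under a mask, so wildcarded bits cost nothing extra.  The
 * helper name is hypothetical.
 */
#include <stdint.h>

static int
mac_match(const uint32_t want[3], const uint32_t mask[3],
    const uint32_t hdr[3])
{
	/* want[] is assumed pre-masked, exactly as the rule stores it */
	return (want[0] == (hdr[0] & mask[0]) &&
	    want[1] == (hdr[1] & mask[1]) &&
	    want[2] == (hdr[2] & mask[2]));
}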
args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(mtod(m, struct ip *), cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == mtod(m, struct ip *)->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = ip_len; else if (cmd->opcode == O_IPTTL) x = mtod(m, struct ip *)->ip_ttl; else /* must be IPID */ x = ntohs(mtod(m, struct ip *)->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (mtod(m, struct ip *)->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, mtod(m, struct ip *)->ip_tos)); break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = ip_len - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == TCP(ulp)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct altq_tag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; mtag = m_tag_find(m, PACKET_TAG_PF_QID, NULL); if (mtag != NULL) break; mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(struct altq_tag), M_NOWAIT); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. 
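/*
 * Illustrative sketch, not part of this change: several opcodes above
 * (O_IP_SRCPORT/O_IP_DSTPORT, O_IPID, O_IPLEN, O_IPTTL, O_TCPDATALEN)
 * share one pattern -- a single value when cmdlen == 1, otherwise
 * (cmdlen - 1) low/high pairs, matching if the value falls inside any
 * pair.  The helper name is hypothetical.
 */
#include <stdint.h>

static int
range_match(uint16_t x, const uint16_t *pairs, int npairs)
{
	int i, match = 0;

	for (i = 0; !match && i < npairs; i++, pairs += 2)
		match = (x >= pairs[0] && x <= pairs[1]);
	return (match);
}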
*/ break; } at = (struct altq_tag *)(mtag+1); at->qid = altq->qid; if (is_ipv4) at->af = AF_INET; else at->af = AF_LINK; at->hdr = ip; m_tag_prepend(m, mtag); break; } case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args, m, oif, offset); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL) : #endif verify_path(src_ip, NULL))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif); else match = 1; break; case O_IPSEC: #ifdef FAST_IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif #ifdef IPSEC match = (ipsec_getnhist(m) != 0); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: if (is_ipv6) { ipfw_insn_ip6 *te = (ipfw_insn_ip6 *)cmd; struct in6_addr p = args->f_id.src_ip6; APPLY_MASK(&p, &te->mask6); match = IN6_ARE_ADDR_EQUAL(&te->addr6, &p); } break; case O_IP6_DST_MASK: if (is_ipv6) { ipfw_insn_ip6 *te = (ipfw_insn_ip6 *)cmd; struct in6_addr p = args->f_id.dst_ip6; APPLY_MASK(&p, &te->mask6); match = IN6_ARE_ADDR_EQUAL(&te->addr6, &p); } break; case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); break; case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. 
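/*
 * Illustrative sketch, not part of this change: O_IP6_SRC_MASK and
 * O_IP6_DST_MASK above AND the packet address with the rule's mask and
 * compare against the stored (pre-masked) address.  The byte-wise toy
 * below gives the same result as the kernel's word-wise APPLY_MASK;
 * types and names are hypothetical.
 */
#include <stdint.h>
#include <string.h>

struct toy_in6 {
	uint8_t b[16];
};

static int
ip6_masked_match(const struct toy_in6 *pkt, const struct toy_in6 *want,
    const struct toy_in6 *mask)
{
	struct toy_in6 p = *pkt;
	int i;

	for (i = 0; i < 16; i++)
		p.b[i] &= mask->b[i];
	return (memcmp(&p, want, sizeof(p)) == 0);
}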
* These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached to make * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args)) { retval = IP_FW_DENY; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += pktlen; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; IPFW_DYN_UNLOCK(); goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ args->cookie = cmd->arg1; retval = IP_FW_DUMMYNET; goto done; case O_DIVERT: case O_TEE: { struct divert_tag *dt; if (args->eh) /* not on layer 2 */ break; mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT); if (mtag == NULL) { /* XXX statistic */ /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } dt = (struct divert_tag *)(mtag+1); dt->cookie = f->rulenum; dt->info = cmd->arg1; m_tag_prepend(m, mtag); retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; goto done; } case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; - f->timestamp = time_second; + f->timestamp = time_uptime; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. 
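/*
 * Illustrative sketch, not part of this change: on a dynamic-rule hit
 * above, execution resumes at the action part of the parent rule --
 * cmd is pointed at offset act_ofs and the remaining length becomes
 * cmd_len - act_ofs.  Toy types are hypothetical.
 */
struct toy_insn {
	int	opcode;
	int	len;
};

struct toy_body {
	int		cmd_len;	/* total instruction words */
	int		act_ofs;	/* where the action starts */
	struct toy_insn	cmd[8];
};

/* Mirrors cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; */
static struct toy_insn *
action_ptr(struct toy_body *f, int *remaining)
{
	*remaining = f->cmd_len - f->act_ofs;
	return (&f->cmd[f->act_ofs]);
}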
*/ if (hlen > 0 && is_ipv4 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, offset,ip_len); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(args->f_id.flags) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6(args, cmd->arg1, offset, hlen); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; goto done; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) args->next_hop = &((ipfw_insn_sa *)cmd)->sa; retval = IP_FW_PASS; goto done; case O_NETGRAPH: case O_NGTEE: args->rule = f; /* report matching rule */ args->cookie = cmd->arg1; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; goto done; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("ipfw: ouch!, skip past end of rules, denying packet\n"); IPFW_RUNLOCK(chain); return (IP_FW_DENY); done: /* Update statistics */ f->pcnt++; f->bcnt += pktlen; - f->timestamp = time_second; + f->timestamp = time_uptime; IPFW_RUNLOCK(chain); return (retval); pullup_failed: if (fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. */ static void flush_rule_ptrs(struct ip_fw_chain *chain) { struct ip_fw *rule; IPFW_WLOCK_ASSERT(chain); for (rule = chain->rules; rule; rule = rule->next) rule->next_rule = NULL; } /* * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given * pipe/queue, or to all of them (match == NULL). */ void flush_pipe_ptrs(struct dn_flow_set *match) { struct ip_fw *rule; IPFW_WLOCK(&layer3_chain); for (rule = layer3_chain.rules; rule; rule = rule->next) { ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule); if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE) continue; /* * XXX Use bcmp/bzero to handle pipe_ptr to overcome * possible alignment problems on 64-bit architectures. * This code is seldom used so we do not worry too * much about efficiency. */ if (match == NULL || !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) ) bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr)); } IPFW_WUNLOCK(&layer3_chain); } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. 
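/*
 * Illustrative sketch, not part of this change: flush_pipe_ptrs() above
 * deliberately goes through bcmp()/bzero() for the pipe_ptr field,
 * because byte-wise access never faults on strict-alignment 64-bit
 * machines even when the field is misaligned inside the instruction
 * stream.  The struct below is hypothetical; memcmp/memset are the
 * userland equivalents.
 */
#include <string.h>

struct unaligned_slot {
	char	pad[4];				/* forces misalignment */
	char	ptr_bytes[sizeof(void *)];	/* pointer stored unaligned */
};

static int
slot_matches(const struct unaligned_slot *s, void *match)
{
	return (memcmp(s->ptr_bytes, &match, sizeof(match)) == 0);
}

static void
slot_clear(struct unaligned_slot *s)
{
	memset(s->ptr_bytes, 0, sizeof(s->ptr_bytes));
}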
*/ static int add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { struct ip_fw *rule, *f, *prev; int l = RULESIZE(input_rule); if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) return (EINVAL); rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO); if (rule == NULL) return (ENOSPC); bcopy(input_rule, rule, l); rule->next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; IPFW_WLOCK(chain); if (chain->rules == NULL) { /* default rule */ chain->rules = rule; goto done; } /* * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ if (autoinc_step < 1) autoinc_step = 1; else if (autoinc_step > 1000) autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default */ for (f = chain->rules; f; f = f->next) { if (f->rulenum == IPFW_DEFAULT_RULE) break; rule->rulenum = f->rulenum; } if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) rule->rulenum += autoinc_step; input_rule->rulenum = rule->rulenum; } /* * Now insert the new rule in the right place in the sorted list. */ for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) { if (f->rulenum > rule->rulenum) { /* found the location */ if (prev) { rule->next = f; prev->next = rule; } else { /* head insert */ rule->next = chain->rules; chain->rules = rule; } break; } } flush_rule_ptrs(chain); done: static_count++; static_len += l; IPFW_WUNLOCK(chain); DEB(printf("ipfw: installed rule %d, static count now %d\n", rule->rulenum, static_count);) return (0); } /** * Remove a static rule (including derived * dynamic rules) * and place it on the ``reap list'' for later reclamation. * The caller is in charge of clearing rule pointers to avoid * dangling pointers. * @return a pointer to the next entry. * Arguments are not checked, so they better be correct. */ static struct ip_fw * remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { struct ip_fw *n; int l = RULESIZE(rule); IPFW_WLOCK_ASSERT(chain); n = rule->next; IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); if (prev == NULL) chain->rules = n; else prev->next = n; static_count--; static_len -= l; rule->next = chain->reap; chain->reap = rule; return n; } /** * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. */ static void reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); } } /* * Remove all rules from a chain (except rules in set RESVD_SET * unless kill_default = 1). The caller is responsible for * reclaiming storage for the rules left in chain->reap. */ static void free_chain(struct ip_fw_chain *chain, int kill_default) { struct ip_fw *prev, *rule; IPFW_WLOCK_ASSERT(chain); flush_rule_ptrs(chain); /* more efficient to do outside the loop */ for (prev = NULL, rule = chain->rules; rule ; ) if (kill_default || rule->set != RESVD_SET) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } /** * Remove all rules with given number, and also do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an u_int32_t. 
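/*
 * Illustrative sketch, not part of this change: add_rule() above
 * auto-numbers a zero-numbered rule from the highest number below the
 * default rule plus autoinc_step, then links it in front of the first
 * higher-numbered rule.  Toy types are hypothetical; no locking.
 */
#define TOY_DEFAULT_RULE 65535

struct fw_rule {
	struct fw_rule	*next;
	unsigned int	 rulenum;
};

static void
insert_sorted(struct fw_rule **head, struct fw_rule *rule, int autoinc_step)
{
	struct fw_rule *prev, *f;

	if (rule->rulenum == 0) {
		/* highest number before the default rule ... */
		for (f = *head; f != NULL && f->rulenum != TOY_DEFAULT_RULE;
		    f = f->next)
			rule->rulenum = f->rulenum;
		/* ... plus the auto-increment step, if it still fits */
		if (rule->rulenum < TOY_DEFAULT_RULE - autoinc_step)
			rule->rulenum += autoinc_step;
	}
	for (prev = NULL, f = *head; f != NULL; prev = f, f = f->next)
		if (f->rulenum > rule->rulenum)
			break;
	rule->next = f;
	if (prev != NULL)
		prev->next = rule;
	else
		*head = rule;
}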
The low 16 bit are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers */ static int del_entry(struct ip_fw_chain *chain, u_int32_t arg) { struct ip_fw *prev = NULL, *rule; u_int16_t rulenum; /* rule or old_set */ u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 4) return EINVAL; if (new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2) { if (rulenum >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > RESVD_SET) /* old_set */ return EINVAL; } IPFW_WLOCK(chain); rule = chain->rules; chain->reap = NULL; switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return EINVAL; } /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. */ flush_rule_ptrs(chain); while (rule->rulenum == rulenum) rule = remove_rule(chain, rule, prev); break; case 1: /* delete all rules with given set number */ flush_rule_ptrs(chain); rule = chain->rules; while (rule->rulenum < IPFW_DEFAULT_RULE) if (rule->set == rulenum) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } break; case 2: /* move rules with given number to new set */ rule = chain->rules; for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; break; case 3: /* move rules with given set number to new set */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; break; case 4: /* swap two sets */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; break; } /* * Look for rules to reclaim. We grab the list before * releasing the lock then reclaim them w/o the lock to * avoid a LOR with dummynet. */ rule = chain->reap; chain->reap = NULL; IPFW_WUNLOCK(chain); if (rule) reap_rules(rule); return 0; } /* * Clear counters for a specific rule. * The enclosing "table" is assumed locked. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * @arg frwl is null to clear all entries, or contains a specific * rule number. * @arg log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, int rulenum, int log_only) { struct ip_fw *rule; char *msg; IPFW_WLOCK(chain); if (rulenum == 0) { norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) clear_counters(rule, log_only); msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. 
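/*
 * Illustrative sketch, not part of this change: the 32-bit argument
 * decoded above packs three fields -- bits 0..15 the rule or set
 * number, bits 16..23 the new set, bits 24..31 the sub-command (0..4).
 * Hypothetical pack/unpack helpers make the layout explicit.
 */
#include <stdint.h>

static void
unpack_del_arg(uint32_t arg, uint16_t *rulenum, uint8_t *new_set,
    uint8_t *cmd)
{
	*rulenum = arg & 0xffff;
	*new_set = (arg >> 16) & 0xff;
	*cmd = (arg >> 24) & 0xff;
}

/* e.g. command 2, "move rules numbered `rulenum` to set `new_set`": */
static uint32_t
pack_move_to_set(uint16_t rulenum, uint8_t new_set)
{
	return ((uint32_t)2 << 24 | (uint32_t)new_set << 16 | rulenum);
}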
*/ for (rule = chain->rules; rule; rule = rule->next) if (rule->rulenum == rulenum) { while (rule && rule->rulenum == rulenum) { clear_counters(rule, log_only); rule = rule->next; } cleared = 1; break; } if (!cleared) { /* we did not find any matching rules */ IPFW_WUNLOCK(chain); return (EINVAL); } msg = log_only ? "ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } IPFW_WUNLOCK(chain); if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert. * Fortunately rules are simple, so this mostly need to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if 
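/*
 * Illustrative sketch, not part of this change: check_ipfw_struct()
 * above relies on each instruction declaring its own length; the
 * per-opcode checks verify that F_LEN matches the expected size, while
 * the loop verifies that the lengths tile cmd_len exactly.  The toy
 * below assumes a hypothetical layout where each instruction's first
 * word holds its length.
 */
static int
check_stream(const int *cmd, int cmd_len)
{
	int l, cmdlen;

	for (l = cmd_len; l > 0; l -= cmdlen, cmd += cmdlen) {
		cmdlen = cmd[0];	/* plays the role of F_LEN(cmd) */
		if (cmdlen <= 0 || cmdlen > l)
			return (-1);	/* truncated opcode */
	}
	return (0);			/* lengths tile cmd_len exactly */
}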
(cmdlen != F_INSN_SIZE(ipfw_insn_pipe)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (!NG_IPFW_LOADED) return EINVAL; else goto check_size; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; int i; /* XXX this can take a long time and locking will block packet flow */ IPFW_RLOCK(chain); for (rule = chain->rules; rule ; rule = rule->next) { /* * Verify the entry fits in the buffer in case the * rules changed between calculating buffer space and * now. This would be better done using a generation * number but should suffice for now. */ i = RULESIZE(rule); if (bp + i <= ep) { bcopy(rule, bp, i); bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), sizeof(set_disable)); bp += i; } } IPFW_RUNLOCK(chain); if (ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets; i++) for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; bcopy(p, dst, sizeof *p); bcopy(&(p->rule->rulenum), &(dst->rule), sizeof(p->rule->rulenum)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ bcopy(&dst, &dst->next, sizeof(dst)); last = dst; dst->expire = - TIME_LEQ(dst->expire, time_second) ? - 0 : dst->expire - time_second ; + TIME_LEQ(dst->expire, time_uptime) ? 
+ 0 : dst->expire - time_uptime ; bp += sizeof(ipfw_dyn_rule); } } IPFW_DYN_UNLOCK(); if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); } return (bp - (char *)buf); } /** * {set|get}sockopt parser. */ static int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error, rule_num; size_t size; struct ip_fw *buf, *rule; u_int32_t rulenum[2]; error = suser(sopt->sopt_td); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { #if __FreeBSD_version >= 500034 error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); #else /* FreeBSD 4.x */ if (securelevel >= 3) return (EPERM); #endif } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, ipfw_getrules(&layer3_chain, buf, size)); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ IPFW_WLOCK(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 0 /* keep default rule */); rule = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (layer3_chain.reap != NULL) reap_rules(rule); break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw) ); if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. 
*/ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rulenum[0]) & ~rulenum[1] & ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ else error = EINVAL; break; case IP_FW_ZERO: case IP_FW_RESETLOG: /* argument is an int, the rule number */ rule_num = 0; if (sopt->sopt_val != 0) { error = sooptcopyin(sopt, &rule_num, sizeof(int), sizeof(int)); if (error) break; } error = zero_entry(&layer3_chain, rule_num, sopt->sopt_name == IP_FW_RESETLOG); break; case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = add_table_entry(ent.tbl, ent.addr, ent.masklen, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = del_table_entry(ent.tbl, ent.addr, ent.masklen); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; error = flush_table(tbl); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; if ((error = count_table(tbl, &cnt))) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); if (tbl == NULL) { error = ENOMEM; break; } error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); error = dump_table(tbl); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here we have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period */ static void ipfw_tick(void * __unused unused) { struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. */
*/ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; - if (TIME_LEQ( time_second+dyn_keepalive_interval, + if (TIME_LEQ( time_uptime+dyn_keepalive_interval, q->expire)) continue; /* too early */ - if (TIME_LEQ(q->expire, time_second)) + if (TIME_LEQ(q->expire, time_uptime)) continue; /* too late, rule expired */ *mtailp = send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; *mtailp = send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; } } IPFW_DYN_UNLOCK(); for (m = mnext = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip_output(m, NULL, NULL, 0, NULL, NULL); } done: callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); } int ipfw_init(void) { struct ip_fw default_rule; int error; #ifdef INET6 /* Setup IPv6 fw sysctl tree. */ sysctl_ctx_init(&ip6_fw_sysctl_ctx); ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, &fw_deny_unknown_exthdrs, 0, "Deny packets with unknown IPv6 Extension Headers"); #endif layer3_chain.rules = NULL; layer3_chain.want_write = 0; layer3_chain.busy_count = 0; cv_init(&layer3_chain.cv, "Condition variable for IPFW rw locks"); IPFW_LOCK_INIT(&layer3_chain); ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule zone", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); callout_init(&ipfw_timeout, NET_CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = RESVD_SET; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; error = add_rule(&layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); return (error); } ip_fw_default_rule = layer3_chain.rules; printf("ipfw2 (+ipv6) initialized, divert %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif default_rule.cmd[0].opcode == O_ACCEPT ? 
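/*
 * Illustrative sketch, not part of this change: the keepalive window
 * test above is the visible payoff of this revision's time_second ->
 * time_uptime switch -- expiry arithmetic on the monotonic uptime
 * clock is immune to wall-clock steps.  The wrap-safe comparison below
 * follows the shape of ipfw's TIME_LEQ; names are hypothetical.
 */
#include <stdint.h>

#define TOY_TIME_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

static int
want_keepalive(uint32_t now, uint32_t expire, uint32_t interval)
{
	if (TOY_TIME_LEQ(now + interval, expire))
		return (0);	/* too early, rule not close to expiry */
	if (TOY_TIME_LEQ(expire, now))
		return (0);	/* too late, rule already expired */
	return (1);		/* inside the window: send keepalives */
}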
"accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); init_tables(); ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); return (0); } void ipfw_destroy(void) { struct ip_fw *reap; ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; callout_drain(&ipfw_timeout); IPFW_WLOCK(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); reap = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (reap != NULL) reap_rules(reap); flush_tables(); IPFW_DYN_LOCK_DESTROY(); uma_zdestroy(ipfw_dyn_rule_zone); IPFW_LOCK_DESTROY(&layer3_chain); #ifdef INET6 /* Free IPv6 fw sysctl tree. */ sysctl_ctx_free(&ip6_fw_sysctl_ctx); #endif printf("IP firewall unloaded\n"); } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 150349) +++ head/sys/netinet/ip_mroute.c (revision 150350) @@ -1,3469 +1,3469 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling * * $FreeBSD$ */ #include "opt_mac.h" #include "opt_mrouting.h" #ifdef PIM #define _PIM_VT 1 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PIM #include #include #endif #include #include /* * Control debugging code for rsvp and multicast routing code. * Can only set them with the debugger. */ static u_int rsvpdebug; /* non-zero enables debugging */ static u_int mrtdebug; /* any set of the flags below */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_PIM 0x20 #define VIFI_INVALID ((vifi_t) -1) #define M_HASCL(m) ((m)->m_flags & M_EXT) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. It may be better to add more fine-grained locking later; * it's not clear how performance-critical this code is. 
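/*
 * Illustrative sketch, not part of this change: the locking note above
 * fixes a global order for the two multicast locks -- when nested, the
 * VIF lock is always taken before the MFC lock, which rules out
 * deadlock between the pair.  Userland pthread mutexes stand in for
 * the kernel mutexes.
 */
#include <pthread.h>

static pthread_mutex_t vif_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mfc_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
update_both_tables(void)
{
	pthread_mutex_lock(&vif_mtx);	/* outer lock always first */
	pthread_mutex_lock(&mfc_mtx);	/* inner lock second */
	/* ... work on both tables ... */
	pthread_mutex_unlock(&mfc_mtx);
	pthread_mutex_unlock(&vif_mtx);
}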
*/ static struct mrtstat mrtstat; SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); static struct mfc *mfctable[MFCTBLSIZ]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() do { \ mtx_assert(&mfc_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static struct vif viftable[MAXVIFS]; SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, &viftable, sizeof(viftable), "S,vif[MAXVIFS]", "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static u_char nexpire[MFCTBLSIZ]; static struct callout expire_upcalls_ch; #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Define the token bucket filter structures * tbftable -> each vif has one of these for storing info */ static struct tbf tbftable[MAXVIFS]; #define TBF_REPROCESS (hz / 100) /* 100x / second */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. */ static struct ifnet multicast_decap_if[MAXVIFS]; #define ENCAP_TTL 64 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ /* prototype IP hdr for encapsulated packets */ static struct ip multicast_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, ENCAP_PROTO, 0, /* checksum */ }; /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
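/*
 * Illustrative sketch, not part of this change: filing each pending
 * timeout in a bucket keyed by its expiration time, as described
 * above, lets the periodic pass touch only buckets whose time has
 * come rather than every meter.  Toy types are hypothetical; the real
 * table is bw_meter_timers[BW_METER_BUCKETS].
 */
#include <stdint.h>

#define TOY_BUCKETS	1024

struct toy_meter {
	struct toy_meter *next;
	uint32_t	  expire;	/* absolute expiration, seconds */
};

static struct toy_meter *timers[TOY_BUCKETS];

static void
schedule_meter(struct toy_meter *x)
{
	unsigned int i = x->expire % TOY_BUCKETS;

	x->next = timers[i];		/* push onto the bucket's list */
	timers[i] = x;
}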
*/ #define BW_METER_BUCKETS 1024 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; static struct callout bw_meter_ch; #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; static u_int bw_upcalls_n; /* # of pending upcalls */ static struct callout bw_upcalls_ch; #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ #ifdef PIM static struct pimstat pimstat; SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, &pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static struct ifnet multicast_register_if; static vifi_t reg_vif_num = VIFI_INVALID; #endif /* PIM */ /* * Private variables. */ static vifi_t numvifs; static const struct encaptab *encap_cookie; /* * one-back cache used by mroute_encapcheck to locate a tunnel's vif * given a datagram's src ip address. */ static u_long last_encap_src; static struct vif *last_encap_vif; /* * Callout for queue processing. 
*/ static struct callout tbf_reprocess_ch; static u_long X_ip_mcast_src(int vifi); static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static int ip_mrouter_init(struct socket *, int); static int add_vif(struct vifctl *); static int del_vif(vifi_t); static int add_mfc(struct mfcctl2 *); static int del_mfc(struct mfcctl2 *); static int set_api_config(uint32_t *); /* chose API capabilities */ static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static int set_assert(int); static void expire_upcalls(void *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static void encap_send(struct ip *, struct vif *, struct mbuf *); static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); static void tbf_queue(struct vif *, struct mbuf *); static void tbf_process_q(struct vif *); static void tbf_reprocess_q(void *); static int tbf_dq_sel(struct vif *, struct ip *); static void tbf_send_packet(struct vif *, struct mbuf *); static void tbf_update_tokens(struct vif *); static int priority(struct vif *, struct ip *); /* * Bandwidth monitoring */ static void free_bw_list(struct bw_meter *list); static int add_bw_upcall(struct bw_upcall *); static int del_bw_upcall(struct bw_upcall *); static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp); static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); static void bw_upcalls_send(void); static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); static void unschedule_bw_meter(struct bw_meter *x); static void bw_meter_process(void); static void expire_bw_upcalls_send(void *); static void expire_bw_meter_process(void *); #ifdef PIM static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); #endif /* * whether or not special PIM assert processing is enabled. */ static int pim_assert; /* * Rate limit for assert notification messages, in usec */ #define ASSERT_MSG_TIME 3000000 /* * Kernel multicast routing API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static uint32_t mrt_api_config = 0; /* * Hash function for a source, group entry */ #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ ((g) >> 20) ^ ((g) >> 10) ^ (g)) /* * Find a route for a given origin IP address and Multicast group address * Type of service parameter to be added in the future!!! 
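/*
 * Illustrative sketch, not part of this change: MFCHASH above mixes
 * the origin and group addresses by xoring each with two shifted
 * copies of itself before reducing to a table index.  Standalone toy
 * below; a power-of-two size makes the reduction a mask.
 */
#include <stdint.h>

#define TOY_MFCTBLSIZ	256

static unsigned int
toy_mfchash(uint32_t a, uint32_t g)
{
	uint32_t h = (a >> 20) ^ (a >> 10) ^ a ^
	    (g >> 20) ^ (g >> 10) ^ g;

	return (h & (TOY_MFCTBLSIZ - 1));
}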
* Statistics are updated by the caller if needed * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) */ static struct mfc * mfc_find(in_addr_t o, in_addr_t g) { struct mfc *rt; MFC_LOCK_ASSERT(); for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) if ((rt->mfc_origin.s_addr == o) && (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) break; return rt; } /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) { \ int xxs; \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; static int version = 0x0305; /* !!! why is this here? 
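/*
 * Illustrative sketch, not part of this change: TV_DELTA above
 * computes the elapsed microseconds between two timevals, with the
 * common one- and two-second cases special-cased so the fast path
 * avoids a multiply.  The plain version below computes the same value
 * with one multiply; the name is hypothetical.
 */
#include <sys/time.h>

static long
tv_delta_usec(const struct timeval *a, const struct timeval *b)
{
	return ((a->tv_sec - b->tv_sec) * 1000000L +
	    (a->tv_usec - b->tv_usec));
}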
XXX */ switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &version, sizeof version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(int cmd, caddr_t data) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = suser(curthread); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(req->src.s_addr, req->grp.s_addr); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = viftable[vifi].v_pkt_in; req->ocount = viftable[vifi].v_pkt_out; req->ibytes = viftable[vifi].v_bytes_in; req->obytes = viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void ip_mrouter_reset(void) { bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); pim_assert = 0; mrt_api_config = 0; callout_init(&expire_upcalls_ch, NET_CALLOUT_MPSAFE); bw_upcalls_n = 0; bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); callout_init(&bw_upcalls_ch, NET_CALLOUT_MPSAFE); callout_init(&bw_meter_ch, NET_CALLOUT_MPSAFE); callout_init(&tbf_reprocess_ch, NET_CALLOUT_MPSAFE); } static struct mtx mrouter_mtx; /* used to synch init/done work */ /* * Enable multicast routing */ static int ip_mrouter_init(struct socket *so, int version) { if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; mtx_lock(&mrouter_mtx); if (ip_mrouter != NULL) { mtx_unlock(&mrouter_mtx); return EADDRINUSE; } callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); ip_mrouter = so; mtx_unlock(&mrouter_mtx); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init\n"); return 0; } /* * Disable multicast routing */ static int X_ip_mrouter_done(void) { vifi_t vifi; int i; struct ifnet *ifp; struct ifreq ifr; struct mfc *rt; struct rtdetq *rte; mtx_lock(&mrouter_mtx); if (ip_mrouter == NULL) { mtx_unlock(&mrouter_mtx); return EINVAL; } /* * Detach/disable hooks to the reset of the system. 
*/ ip_mrouter = NULL; mrt_api_config = 0; VIF_LOCK(); if (encap_cookie) { const struct encaptab *c = encap_cookie; encap_cookie = NULL; encap_detach(c); } VIF_UNLOCK(); callout_stop(&tbf_reprocess_ch); VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_lcl_addr.s_addr != 0 && !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); so->sin_len = sizeof(struct sockaddr_in); so->sin_family = AF_INET; so->sin_addr.s_addr = INADDR_ANY; ifp = viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)tbftable, sizeof(tbftable)); bzero((caddr_t)viftable, sizeof(viftable)); numvifs = 0; pim_assert = 0; VIF_UNLOCK(); /* * Free all multicast forwarding cache entries. */ callout_stop(&expire_upcalls_ch); callout_stop(&bw_upcalls_ch); callout_stop(&bw_meter_ch); MFC_LOCK(); for (i = 0; i < MFCTBLSIZ; i++) { for (rt = mfctable[i]; rt != NULL; ) { struct mfc *nr = rt->mfc_next; for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } free_bw_list(rt->mfc_bw_meter); free(rt, M_MRTABLE); rt = nr; } } bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); bw_upcalls_n = 0; bzero(bw_meter_timers, sizeof(bw_meter_timers)); MFC_UNLOCK(); /* * Reset de-encapsulation cache */ last_encap_src = INADDR_ANY; last_encap_vif = NULL; #ifdef PIM reg_vif_num = VIFI_INVALID; #endif mtx_unlock(&mrouter_mtx); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_done\n"); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; pim_assert = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { int i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (numvifs > 0) { *apival = 0; return EPERM; } if (pim_assert) { *apival = 0; return EPERM; } for (i = 0; i < MFCTBLSIZ; i++) { if (mfctable[i] != NULL) { *apival = 0; return EPERM; } } mrt_api_config = *apival & mrt_api_support; *apival = mrt_api_config; return 0; } /* * Decide if a packet is from a tunnelled peer. * Return 0 if not, 64 if so. XXX yuck.. 64 ??? */ static int mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; /* * don't claim the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. * Note: This code assumes that the remote site IP address * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). */ if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr))) return 0; if (ip->ip_src.s_addr != last_encap_src) { struct vif *vifp = viftable; struct vif *vife = vifp + numvifs; last_encap_src = ip->ip_src.s_addr; last_encap_vif = NULL; for ( ; vifp < vife; ++vifp) if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) last_encap_vif = vifp; break; } } if (last_encap_vif == NULL) { last_encap_src = INADDR_ANY; return 0; } return 64; } /* * De-encapsulate a packet and feed it back through ip input (this * routine is called whenever IP gets a packet that mroute_encap_func() * claimed). 
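 * Schematically, the function below peels the outer header off in
 * place and re-queues the payload as if it had just arrived:
 *
 *	[ outer IP | inner IP | payload ]  ->  [ inner IP | payload ]
 *
 * by advancing m_data/m_len past the (option-stripped) outer header
 * and handing the mbuf back to the IP input path via netisr_queue().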
*/ static void mroute_encap_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; if (hlen > sizeof(struct ip)) ip_stripoptions(m, (struct mbuf *) 0); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); m->m_pkthdr.rcvif = last_encap_vif->v_ifp; netisr_queue(NETISR_IP, m); /* mbuf is free'd on failure. */ /* * normally we would need a "schednetisr(NETISR_IP)" * here but we were called by ip_input and it is going * to loop back & try to dequeue the packet we just * queued as soon as we return so we avoid the * unnecessary software interrupt. * * XXX * This no longer holds - we may have direct-dispatched the packet, * or there may be a queue processing limit. */ } extern struct domain inetdomain; static struct protosw mroute_encap_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, mroute_encap_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { VIF_UNLOCK(); return EADDRINUSE; } if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in the AF_INET family */ #ifdef PIM if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. */ ifp = NULL; } else #endif { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; } if (vifcp->vifc_flags & VIFF_TUNNEL) { if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { /* * An encapsulating tunnel is wanted. Tell * mroute_encap_input() to start paying attention * to encapsulated packets.
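 * For context, a tunnel vif of this kind is typically configured
 * from userland along these lines (illustrative sketch only; the
 * addresses are placeholders):
 *
 *	struct vifctl vc;
 *	bzero(&vc, sizeof(vc));
 *	vc.vifc_vifi = 1;			(slot in viftable)
 *	vc.vifc_flags = VIFF_TUNNEL;
 *	vc.vifc_threshold = 1;			(min TTL to forward)
 *	vc.vifc_lcl_addr.s_addr = ...;		(local endpoint)
 *	vc.vifc_rmt_addr.s_addr = ...;		(remote endpoint)
 *	setsockopt(sock, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *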
*/ if (encap_cookie == NULL) { int i; encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, mroute_encapcheck, (struct protosw *)&mroute_encap_protosw, NULL); if (encap_cookie == NULL) { printf("ip_mroute: unable to attach encap\n"); VIF_UNLOCK(); return EIO; /* XXX */ } for (i = 0; i < MAXVIFS; ++i) { if_initname(&multicast_decap_if[i], "mdecap", i); } } /* * Set interface to fake encapsulator interface */ ifp = &multicast_decap_if[vifcp->vifc_vifi]; /* * Prepare cached route entry */ bzero(&vifp->v_route, sizeof(vifp->v_route)); } else { log(LOG_ERR, "source routed tunnels not supported\n"); VIF_UNLOCK(); return EOPNOTSUPP; } #ifdef PIM } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &multicast_register_if; if (mrtdebug) log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", (void *)&multicast_register_if); if (reg_vif_num == VIFI_INVALID) { if_initname(&multicast_register_if, "register_vif", 0); multicast_register_if.if_flags = IFF_LOOPBACK; bzero(&vifp->v_route, sizeof(vifp->v_route)); reg_vif_num = vifcp->vifc_vifi; } #endif } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } /* define parameters for the tbf structure */ vifp->v_tbf = v_tbf; GET_TIME(vifp->v_tbf->tbf_last_pkt_t); vifp->v_tbf->tbf_n_tok = 0; vifp->v_tbf->tbf_q_len = 0; vifp->v_tbf->tbf_max_q_len = MAXQSIZE; vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* scaling up here allows division by 1024 in critical code */ vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; vifp->v_rsvp_on = 0; vifp->v_rsvpd = NULL; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); if (mrtdebug) log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", vifcp->vifc_vifi, (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), vifcp->vifc_threshold, vifcp->vifc_rate_limit); return 0; } /* * Delete a vif from the vif table */ static int del_vif(vifi_t vifi) { struct vif *vifp; VIF_LOCK(); if (vifi >= numvifs) { VIF_UNLOCK(); return EINVAL; } vifp = &viftable[vifi]; if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { VIF_UNLOCK(); return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp == last_encap_vif) { last_encap_vif = NULL; last_encap_src = INADDR_ANY; } /* * Free packets queued at the interface */ while (vifp->v_tbf->tbf_q) { struct mbuf *m = vifp->v_tbf->tbf_q; vifp->v_tbf->tbf_q = m->m_act; m_freem(m); } #ifdef PIM if (vifp->v_flags & VIFF_REGISTER) reg_vif_num = VIFI_INVALID; #endif bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); bzero((caddr_t)vifp, sizeof (*vifp)); if (mrtdebug) log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); /* Adjust numvifs down */ for (vifi = numvifs; vifi > 0; vifi--) if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) break; numvifs = vifi; VIF_UNLOCK(); return 0; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; u_long hash; struct rtdetq *rte; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); /* If an entry already exists, just update the fields */ if (rt) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Find the entry for which the upcall was made and update */ hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", "multiple kernel entries", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ nexpire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip_mdq(rte->m, 
rte->ifp, rt, -1); m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } rt->mfc_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } init_mfc_params(rt, mfccp); rt->mfc_expire = 0; rt->mfc_stall = NULL; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; } } MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; struct mfc **nptr; u_long hash; struct bw_meter *list; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next) if (origin.s_addr == rt->mfc_origin.s_addr && mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && rt->mfc_stall == NULL) break; if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } *nptr = rt->mfc_next; /* * free the bw_meter entries */ list = rt->mfc_bw_meter; rt->mfc_bw_meter = NULL; free(rt, M_MRTABLE); free_bw_list(list); MFC_UNLOCK(); return 0; } /* * Send a message to mrouted on the multicast routing socket */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), (void *)ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. 
*/ static int last_log; - if (last_log != time_second) { - last_log = time_second; + if (last_log != time_uptime) { + last_log = time_uptime; log(LOG_ERR, "ip_mforward: received source-routed packet from %lx\n", (u_long)ntohl(ip->ip_src.s_addr)); } return 1; } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) { if (ip->ip_ttl < 255) ip->ip_ttl++; /* compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { struct vif *vifp = viftable + vifi; printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr), vifi, (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", vifp->v_ifp->if_xname); } error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr)); if (!imo) printf("In fact, no options were specified at all\n"); } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ ++mrtstat.mrts_mfc_lookups; rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; ++mrtstat.mrts_mfc_misses; mrtstat.mrts_no_route++; if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_DONTWAIT); if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. 
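 * The upcall built below is a copy of the packet's IP header
 * reinterpreted as a struct igmpmsg (the two layouts are assumed to
 * overlay, as elsewhere in this file); e.g. for a miss on a flow
 * arriving on vif 2 the daemon reads from the routing socket:
 *
 *	im_msgtype == IGMPMSG_NOCACHE, im_mbz == 0, im_vif == 2
 *
 * with im_src/im_dst still holding the packet's source and group,
 * and is expected to answer with MRT_ADD_MFC to install the route.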
*/ for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; nexpire[hash]++; for (i = 0; i < numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */ rt->mfc_bw_meter = NULL; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; rt->mfc_stall = rte; } else { /* determine if q has overflowed */ int npkts = 0; struct rtdetq **p; /* * XXX ouch! we need to append to the list, but we * only have a pointer to the front, so we have to * scan the entire list every time. */ for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) npkts++; if (npkts > MAX_UPQ) { mrtstat.mrts_upq_ovflw++; non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->m = mb0; rte->ifp = ifp; rte->next = NULL; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mfc *mfc, **nptr; int i; MFC_LOCK(); for (i = 0; i < MFCTBLSIZ; i++) { if (nexpire[i] == 0) continue; nptr = &mfctable[i]; for (mfc = *nptr; mfc != NULL; mfc = *nptr) { /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 && --mfc->mfc_expire == 0) { if (mrtdebug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", (u_long)ntohl(mfc->mfc_origin.s_addr), (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ for (rte = mfc->mfc_stall; rte; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } ++mrtstat.mrts_cache_cleanups; nexpire[i]--; /* * free the bw_meter entries */ while (mfc->mfc_bw_meter != NULL) { struct bw_meter *x = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } *nptr = mfc->mfc_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mfc_next; } } } MFC_UNLOCK(); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ip->ip_len; VIF_LOCK_ASSERT(); /* * Macro to send packet on vif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. 
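 * The MC_SEND() macro defined below just dispatches on the vif
 * type, e.g. MC_SEND(ip, viftable + vifi, m) expands to
 *
 *	encap_send(ip, viftable + vifi, m)	for VIFF_TUNNEL vifs,
 *	phyint_send(ip, viftable + vifi, m)	otherwise;
 *
 * both paths are then subject to the vif's token-bucket rate limit
 * when one is configured.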
*/ #define MC_SEND(ip,vifp,m) { \ if ((vifp)->v_flags & VIFF_TUNNEL) \ encap_send((ip), (vifp), (m)); \ else \ phyint_send((ip), (vifp), (m)); \ } /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < numvifs) { #ifdef PIM if (viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + xmt_vif, m, rt); else #endif MC_SEND(ip, viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { /* came in the wrong interface */ if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); ++mrtstat.mrts_wrong_if; ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { struct timeval now; u_long delta; #ifdef PIM if (ifp == &multicast_register_if) pimstat.pims_rcv_registers_wrongiif++; #endif /* Get vifi for the incoming packet */ for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ GET_TIME(now); TV_DELTA(rt->mfc_last_assert, now, delta); if (delta > ASSERT_MSG_TIME) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); if (mm && (M_HASCL(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; rt->mfc_last_assert = now; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = im->im_src; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; } else { viftable[vifi].v_pkt_in++; viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; #ifdef PIM if (viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, viftable + vifi, m, rt); else #endif MC_SEND(ip, viftable+vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; GET_TIME(now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * check if a vif number is legal/ok. This is used by ip_output. */ static int X_legal_vif_num(int vif) { /* XXX unlocked, matter? 
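 * For context: this is reached through the legal_vif_num hook set
 * in ip_mroute_modevent(), e.g. (an assumption about the usual
 * caller) when validating a vif a process selected with
 *
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_VIF, &vifi, sizeof(vifi));
 *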
*/ return (vif >= 0 && vif < numvifs); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { /* XXX unlocked, matter? */ if (vifi >= 0 && vifi < numvifs) return viftable[vifi].v_lcl_addr.s_addr; else return INADDR_ANY; } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); } static void encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; struct ip *ip_copy; int i, len = ip->ip_len; VIF_LOCK_ASSERT(); /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * copy the old packet & pullup its IP header into the * new mbuf so we can modify it. Try to fill the new * mbuf since if we don't the ethernet driver will. */ MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); if (mb_copy == NULL) return; #ifdef MAC mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy); #endif mb_copy->m_data += max_linkhdr; mb_copy->m_len = sizeof(multicast_encap_iphdr); if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) { m_freem(mb_copy); return; } i = MHLEN - M_LEADINGSPACE(mb_copy); if (i > len) i = len; mb_copy = m_pullup(mb_copy, i); if (mb_copy == NULL) return; mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); /* * fill in the encapsulating IP header. */ ip_copy = mtod(mb_copy, struct ip *); *ip_copy = multicast_encap_iphdr; ip_copy->ip_id = ip_newid(); ip_copy->ip_len += len; ip_copy->ip_src = vifp->v_lcl_addr; ip_copy->ip_dst = vifp->v_rmt_addr; /* * turn the encapsulated IP header back into a valid one. */ ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); --ip->ip_ttl; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; mb_copy->m_data += sizeof(multicast_encap_iphdr); ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); mb_copy->m_data -= sizeof(multicast_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); } /* * Token bucket filter module */ static void tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); if (p_len > MAX_BKT_SIZE) { /* drop if packet is too large */ mrtstat.mrts_pkt2large++; m_freem(m); return; } tbf_update_tokens(vifp); if (t->tbf_q_len == 0) { /* queue empty... 
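 * Worked example of the token arithmetic (illustrative): for a vif
 * configured with vifc_rate_limit = 1000 (kbit/s), add_vif() stores
 * v_rate_limit = 1024, so tbf_update_tokens() above accrued
 *
 *	tm * 1024 / 1024 / 8  =  tm / 8
 *
 * token bytes for tm microseconds elapsed, i.e. 125 bytes per ms
 * (= 1000 kbit/s); a 1500-byte packet thus needs ~12 ms of credit.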
*/ if (p_len <= t->tbf_n_tok) { /* send packet if enough tokens */ t->tbf_n_tok -= p_len; tbf_send_packet(vifp, m); } else { /* no, queue packet and try later */ tbf_queue(vifp, m); callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); } } else if (t->tbf_q_len < t->tbf_max_q_len) { /* finite queue length, so queue pkts and process queue */ tbf_queue(vifp, m); tbf_process_q(vifp); } else { /* queue full, try to dq and queue and process */ if (!tbf_dq_sel(vifp, ip)) { mrtstat.mrts_q_overflow++; m_freem(m); } else { tbf_queue(vifp, m); tbf_process_q(vifp); } } } /* * adds a packet to the queue at the interface */ static void tbf_queue(struct vif *vifp, struct mbuf *m) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); if (t->tbf_t == NULL) /* Queue was empty */ t->tbf_q = m; else /* Insert at tail */ t->tbf_t->m_act = m; t->tbf_t = m; /* Set new tail pointer */ #ifdef DIAGNOSTIC /* Make sure we didn't get fed a bogus mbuf */ if (m->m_act) panic("tbf_queue: m_act"); #endif m->m_act = NULL; t->tbf_q_len++; } /* * processes the queue at the interface */ static void tbf_process_q(struct vif *vifp) { struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); /* loop through the queue at the interface and send as many packets * as possible */ while (t->tbf_q_len > 0) { struct mbuf *m = t->tbf_q; int len = mtod(m, struct ip *)->ip_len; /* determine if the packet can be sent */ if (len > t->tbf_n_tok) /* not enough tokens, we are done */ break; /* ok, reduce no of tokens, dequeue and send the packet. */ t->tbf_n_tok -= len; t->tbf_q = m->m_act; if (--t->tbf_q_len == 0) t->tbf_t = NULL; m->m_act = NULL; tbf_send_packet(vifp, m); } } static void tbf_reprocess_q(void *xvifp) { struct vif *vifp = xvifp; if (ip_mrouter == NULL) return; VIF_LOCK(); tbf_update_tokens(vifp); tbf_process_q(vifp); if (vifp->v_tbf->tbf_q_len) callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); VIF_UNLOCK(); } /* function that will selectively discard a member of the queue * based on the precedence value and the priority */ static int tbf_dq_sel(struct vif *vifp, struct ip *ip) { u_int p; struct mbuf *m, *last; struct mbuf **np; struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); p = priority(vifp, ip); np = &t->tbf_q; last = NULL; while ((m = *np) != NULL) { if (p > priority(vifp, mtod(m, struct ip *))) { *np = m->m_act; /* If we're removing the last packet, fix the tail pointer */ if (m == t->tbf_t) t->tbf_t = last; m_freem(m); /* It's impossible for the queue to be empty, but check anyways. */ if (--t->tbf_q_len == 0) t->tbf_t = NULL; mrtstat.mrts_drop_sel++; return 1; } np = &m->m_act; last = m; } return 0; } static void tbf_send_packet(struct vif *vifp, struct mbuf *m) { VIF_LOCK_ASSERT(); if (vifp->v_flags & VIFF_TUNNEL) /* If tunnel options */ ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL); else { struct ip_moptions imo; int error; static struct route ro; /* XXX check this */ imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. 
*/ error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL); if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", (int)(vifp - viftable), error); } } /* determine the current time and then * the elapsed time (between the last update and now) * in microseconds & update the no. of tokens in the bucket */ static void tbf_update_tokens(struct vif *vifp) { struct timeval tp; u_long tm; struct tbf *t = vifp->v_tbf; VIF_LOCK_ASSERT(); GET_TIME(tp); TV_DELTA(tp, t->tbf_last_pkt_t, tm); /* * This formula is actually * "time in seconds" * "bytes/second". * * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) * * The (1000/1024) was introduced in add_vif to optimize * this divide into a shift. */ t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; t->tbf_last_pkt_t = tp; if (t->tbf_n_tok > MAX_BKT_SIZE) t->tbf_n_tok = MAX_BKT_SIZE; } static int priority(struct vif *vifp, struct ip *ip) { int prio = 50; /* the lowest priority -- default case */ /* temporary hack; may add general packet classifier some day */ /* * The UDP port space is divided up into four priority ranges: * [0, 16384) : unclassified - lowest priority * [16384, 32768) : audio - highest priority * [32768, 49152) : whiteboard - medium priority * [49152, 65536) : video - low priority * * Everything else gets lowest priority. */ if (ip->ip_p == IPPROTO_UDP) { struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); switch (ntohs(udp->uh_dport) & 0xc000) { case 0x4000: prio = 70; break; case 0x8000: prio = 60; break; case 0xc000: prio = 55; break; } } return prio; } /* * End of token bucket filter modifications */ static int X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) { int error, vifi; if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) return error; VIF_LOCK(); if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */ VIF_UNLOCK(); return EADDRNOTAVAIL; } if (sopt->sopt_name == IP_RSVP_VIF_ON) { /* Check if socket is available. */ if (viftable[vifi].v_rsvpd != NULL) { VIF_UNLOCK(); return EADDRINUSE; } viftable[vifi].v_rsvpd = so; /* This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 1; rsvp_on++; } } else { /* must be VIF_OFF */ /* * XXX as an additional consistency check, one could make sure * that viftable[vifi].v_rsvpd == so, otherwise passing so as * first parameter is pretty useless. */ viftable[vifi].v_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } VIF_UNLOCK(); return 0; } static void X_ip_rsvp_force_done(struct socket *so) { int vifi; /* Don't bother if it is not the right type of socket. */ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return; VIF_LOCK(); /* The socket may be attached to more than one vif...this * is perfectly legal. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_rsvpd == so) { viftable[vifi].v_rsvpd = NULL; /* This may seem silly, but we need to be sure we don't * over-decrement the RSVP counter, in case something slips up.
*/ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } } VIF_UNLOCK(); } static void X_rsvp_input(struct mbuf *m, int off) { int vifi; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; struct ifnet *ifp; if (rsvpdebug) printf("rsvp_input: rsvp_on %d\n",rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (rsvpdebug) printf("rsvp_input: check vifs\n"); #ifdef DIAGNOSTIC M_ASSERTPKTHDR(m); #endif ifp = m->m_pkthdr.rcvif; VIF_LOCK(); /* Find which vif the packet arrived on. */ for (vifi = 0; vifi < numvifs; vifi++) if (viftable[vifi].v_ifp == ifp) break; if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { /* * Drop the lock here to avoid holding it across rip_input. * This could make rsvpdebug printfs wrong. If you care, * record the state of stuff before dropping the lock. */ VIF_UNLOCK(); /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ } else { if (rsvpdebug && vifi == numvifs) printf("rsvp_input: Can't find vif for packet.\n"); else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) printf("rsvp_input: No socket defined for vif %d\n",vifi); m_freem(m); } return; } rsvp_src.sin_addr = ip->ip_src; if (rsvpdebug && m) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); } else { if (rsvpdebug) printf("rsvp_input: send packet up\n"); } VIF_UNLOCK(); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; 
x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; GET_TIME(now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 
((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &bw_upcalls[bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = bw_upcalls_n * sizeof(bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (bw_upcalls_n == 0) return; /* No pending upcalls */ bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. 
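 * Resulting message layout (sketch):
 *
 *	[ struct igmpmsg, im_msgtype = IGMPMSG_BW_UPCALL ]
 *	[ bw_upcalls[0] ... bw_upcalls[bw_upcalls_n - 1] ]
 *
 * i.e. one routing-socket message carries every pending bandwidth
 * upcall.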
*/ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m->m_len = m->m_pkthdr.len = 0; m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { static uint32_t last_tv_sec; /* last time we processed this */ uint32_t loops; int i; struct timeval now, process_endtime; GET_TIME(now); if (last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - last_tv_sec; last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. 
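 * Worked example (illustrative): with last_tv_sec == 100 and
 * now.tv_sec == 103, loops == 3 and the walk below visits bins
 * 101 % BW_METER_BUCKETS, 102 % BW_METER_BUCKETS and
 * 103 % BW_METER_BUCKETS, so no one-second bin is skipped even if
 * processing fell behind (capped at one full sweep of the buckets).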
*/ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = bw_meter_timers[i]; bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = bw_meter_timers[time_hash]; bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *unused) { MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, NULL); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *unused) { if (mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); } /* * End of bandwidth monitoring code */ #ifdef PIM /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_register_send: "); mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((mrt_api_config & MRT_MFC_RP) && (rt->mfc_rp.s_addr != INADDR_ANY)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
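 * The Register MTU computed below works out to
 *
 *	0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr)
 *	       = 65535 - 20 - 8 = 65507
 *
 * (assuming the usual 20-byte encapsulating IP header and 8-byte
 * PIM Register header); anything longer goes through ip_fragment().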
*/ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ip->ip_len <= mtu) { /* Turn the IP header into a valid one */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { if (mrtdebug & DEBUG_PIM) log(LOG_WARNING, "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); ++mrtstat.mrts_upq_sockfull; return ENOBUFS; } /* Keep statistics */ pimstat.pims_snd_registers_msgs++; pimstat.pims_snd_registers_bytes += len; return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_id = ip_newid(); ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); ip_outer->ip_src = viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. 
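 * Wire format of the Register built here (sketch):
 *
 *	[ outer IP: proto PIM, src = vif's local addr, dst = RP ]
 *	[ PIM header, type Register ][ 32-bit flags word ]
 *	[ inner IP packet, TTL already decremented in
 *	  pim_register_prepare() ]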
*/ ip_outer->ip_tos = ip->ip_tos; if (ntohs(ip->ip_off) & IP_DF) ip_outer->ip_off |= IP_DF; pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_first); else tbf_control(vifp, mb_first, ip, ip_outer->ip_len); /* Keep statistics */ pimstat.pims_snd_registers_msgs++; pimstat.pims_snd_registers_bytes += len; return 0; } /* * PIM-SMv2 and PIM-DM message processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ void pim_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); struct pim *pim; int minlen; int datalen = ip->ip_len; int ip_tos; int iphlen = off; /* Keep statistics */ pimstat.pims_rcv_total_msgs++; pimstat.pims_rcv_total_bytes += datalen; /* * Validate lengths */ if (datalen < PIM_MINLEN) { pimstat.pims_rcv_tooshort++; log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", datalen, (u_long)ip->ip_src.s_addr); m_freem(m); return; } /* * If the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { log(LOG_ERR, "pim_input: m_pullup failure\n"); return; } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reasons, perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { pimstat.pims_rcv_badsum++; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: invalid checksum"); m_freem(m); return; } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { pimstat.pims_rcv_badversion++; log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return; } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon.
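 * Given the sizes noted earlier (PIM_MINLEN == 8, PIM_REG_MINLEN ==
 * 28), the head copied below is iphlen + 28 bytes: outer IP header,
 * PIM header, Register flags and the inner IP header, which is all
 * the daemon needs to decide on e.g. a PIM_REGISTER_STOP.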
*/ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: register vif not set: %d\n", reg_vif_num); m_freem(m); return; } /* XXX need refcnt? */ vifp = viftable[reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { pimstat.pims_rcv_tooshort++; pimstat.pims_rcv_badregisters++; log(LOG_ERR, "pim_input: register packet size too small %d from %lx\n", datalen, (u_long)ip->ip_src.s_addr); m_freem(m); return; } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), ntohs(encap_ip->ip_len)); } /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { pimstat.pims_rcv_badregisters++; if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input: invalid IP version (%d) " "of the inner packet\n", encap_ip->ip_v); } m_freem(m); return; } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { pimstat.pims_rcv_badregisters++; if (mrtdebug & DEBUG_PIM) log(LOG_DEBUG, "pim_input: inner packet of register is not " "multicast %lx\n", (u_long)ntohl(encap_ip->ip_dst.s_addr)); m_freem(m); return; } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); if (mcp == NULL) { log(LOG_ERR, "pim_input: pim register: could not copy register head\n"); m_freem(m); return; } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ pimstat.pims_rcv_registers_msgs++; pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); if (mrtdebug & DEBUG_PIM) { log(LOG_DEBUG, "pim_input: forwarding decapsulated register: " "src %lx, dst %lx, vif %d\n", (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), reg_vif_num); } /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. 
This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjusted to * reflect the fact that the inner multicast data is truncated. */ rip_input(m, iphlen); return; } #endif /* PIM */ static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: mtx_init(&mrouter_mtx, "mrouter initialization", NULL, MTX_DEF); MFC_LOCK_INIT(); VIF_LOCK_INIT(); ip_mrouter_reset(); ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shut down the kernel services (the check * below ensures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to clean up. */ if (ip_mrouter) return EINVAL; X_ip_mrouter_done(); ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); mtx_destroy(&mrouter_mtx); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/netinet/libalias/alias_db.c =================================================================== --- head/sys/netinet/libalias/alias_db.c (revision 150349) +++ head/sys/netinet/libalias/alias_db.c (revision 150350) @@ -1,2815 +1,2815 @@ /*- * Copyright (c) 2001 Charles Mott * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Alias_db.c encapsulates all data structures used for storing packet aliasing data. Other parts of the aliasing software access data through functions provided in this file.
Data storage is based on the notion of a "link", which is established for ICMP echo/reply packets, UDP datagrams and TCP stream connections. A link stores the original source and destination addresses. For UDP and TCP, it also stores source and destination port numbers, as well as an alias port number. Links are also used to store information about fragments. There is a facility for sweeping through and deleting old links as new packets are sent through. A simple timeout is used for ICMP and UDP links. TCP links are left alone unless there is an incomplete connection, in which case the link can be deleted after a certain amount of time. Initial version: August, 1996 (cjm) Version 1.4: September 16, 1996 (cjm) Facility for handling incoming links added. Version 1.6: September 18, 1996 (cjm) ICMP data handling simplified. Version 1.7: January 9, 1997 (cjm) Fragment handling simplified. Saves pointers for unresolved fragments. Permits links for unspecified remote ports or unspecified remote addresses. Fixed bug which did not properly zero port table entries after a link was deleted. Cleaned up some obsolete comments. Version 1.8: January 14, 1997 (cjm) Fixed data type error in StartPoint(). (This error did not exist prior to v1.7 and was discovered and fixed by Ari Suutari) Version 1.9: February 1, 1997 Optionally, connections initiated from the packet aliasing host machine will not have their port number aliased unless it conflicts with an aliasing port already being used. (cjm) All options formerly #ifdef'ed are now available through a new interface, SetPacketAliasMode(). This allows run time control (which is now available in PPP+pktAlias through the 'alias' keyword). (ee) Added ability to create an alias port without either destination address or port specified. port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee) Removed K&R style function headers and general cleanup. (ee) Added packetAliasMode to replace compiler #defines (ee) Allocates sockets for partially specified ports if ALIAS_USE_SOCKETS defined. (cjm) Version 2.0: March, 1997 SetAliasAddress() will now clean up alias links if the aliasing address is changed. (cjm) PacketAliasPermanentLink() function added to support permanent links. (J. Fortes suggested the need for this.) Examples: (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port (192.168.0.2, port 21) <-> alias port 3604, known dest addr unknown dest port These permanent links allow for incoming connections to machines on the local network. They can be given with a user-chosen amount of specificity, with increasing specificity meaning more security. (cjm) Quite a bit of rework to the basic engine. The portTable[] array, which kept track of which ports were in use, was replaced by a table/linked list structure. (cjm) SetExpire() function added. (cjm) DeleteLink() no longer frees memory associated with a pointer to a fragment (this bug was first recognized by E. Eklund in v1.9). Version 2.1: May, 1997 (cjm) Packet aliasing engine reworked so that it can handle multiple external addresses rather than just a single host address. PacketAliasRedirectPort() and PacketAliasRedirectAddr() added to the API. The first function is a more generalized version of PacketAliasPermanentLink(). The second function implements static network address translation. Version 3.2: July, 2000 (salander and satoh) Added FindNewPortGroup to get contiguous range of port values. Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing link but not actually add one.
Added FindRtspOut, which is closely derived from FindUdpTcpOut, except that the alias port (from FindNewPortGroup) is provided as input. See HISTORY file for additional revisions. */ #ifdef _KERNEL #include #else #include #endif #include #include #include #include #ifdef _KERNEL #include #include #include #include #else #include #include #include #include #endif /* BSD network include files */ #include #include #include #include #ifdef _KERNEL #include #include #else #include "alias.h" #include "alias_local.h" #endif static LIST_HEAD(, libalias) instancehead = LIST_HEAD_INITIALIZER(instancehead); /* Constants (note: constants are also defined near relevant functions or structs) */ /* Parameters used for cleanup of expired links */ #define ALIAS_CLEANUP_INTERVAL_SECS 60 #define ALIAS_CLEANUP_MAX_SPOKES 30 /* Timeouts (in seconds) for different link types */ #define ICMP_EXPIRE_TIME 60 #define UDP_EXPIRE_TIME 60 #define PROTO_EXPIRE_TIME 60 #define FRAGMENT_ID_EXPIRE_TIME 10 #define FRAGMENT_PTR_EXPIRE_TIME 30 /* TCP link expire time for different cases */ /* When the link has been used and closed - minimal grace time to allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */ #ifndef TCP_EXPIRE_DEAD #define TCP_EXPIRE_DEAD 10 #endif /* When the link has been used and closed on one side - the other side is allowed to still send data */ #ifndef TCP_EXPIRE_SINGLEDEAD #define TCP_EXPIRE_SINGLEDEAD 90 #endif /* When the link isn't yet up */ #ifndef TCP_EXPIRE_INITIAL #define TCP_EXPIRE_INITIAL 300 #endif /* When the link is up */ #ifndef TCP_EXPIRE_CONNECTED #define TCP_EXPIRE_CONNECTED 86400 #endif /* Dummy port number codes used for FindLinkIn/Out() and AddLink(). These constants can be anything except zero, which indicates an unknown port number. */ #define NO_DEST_PORT 1 #define NO_SRC_PORT 1 /* Data Structures The fundamental data structure used in this program is "struct alias_link". Whenever a TCP connection is made, a UDP datagram is sent out, or an ICMP echo request is made, a link record is made (if it has not already been created). The link record is identified by the source address/port and the destination address/port. In the case of an ICMP echo request, the source port is treated as being equivalent to the 16-bit ID number of the ICMP packet. The link record can also store some auxiliary data. For TCP connections that have had sequence and acknowledgment modifications, data space is available to track these changes. A state field is used to keep track of changes to the TCP connection state. ID numbers of fragments can also be stored in the auxiliary space. Pointers to unresolved fragments can also be stored. The link records support two independent chainings. The input and output lookup tables hold the initial pointers to the link chains. On input, the lookup table indexes on alias port and link type. On output, the lookup table indexes on source address, destination address, source port, destination port and link type.
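The same link is therefore reachable through two different keys. A minimal user-space sketch of the idea (the hash functions and table size here are invented for illustration; StartPointIn() and StartPointOut() below are the real versions):

#include <stdio.h>

#define N_BUCKETS 101   /* illustrative size, not LINK_TABLE_*_SIZE */

/* Incoming packets: only the alias port and link type are known. */
static unsigned
bucket_in(unsigned alias_port, int link_type)
{
    return ((alias_port + link_type) % N_BUCKETS);
}

/* Outgoing packets: the full address/port tuple is known. */
static unsigned
bucket_out(unsigned src, unsigned dst, unsigned sport, unsigned dport,
    int link_type)
{
    return ((src + dst + sport + dport + link_type) % N_BUCKETS);
}

int
main(void)
{
    /* One link is inserted on both chains, under both indices. */
    printf("in-bucket %u\n", bucket_in(8080, 6));
    printf("out-bucket %u\n",
        bucket_out(0x0a000001, 0xc0a80001, 1025, 80, 6));
    return (0);
}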
*/ struct ack_data_record { /* used to save changes to ACK/sequence * numbers */ u_long ack_old; u_long ack_new; int delta; int active; }; struct tcp_state { /* Information about TCP connection */ int in; /* State for outside -> inside */ int out; /* State for inside -> outside */ int index; /* Index to ACK data array */ int ack_modified; /* Indicates whether ACK and * sequence numbers have been modified */ }; #define N_LINK_TCP_DATA 3 /* Number of distinct ACK number changes * saved for a modified TCP stream */ struct tcp_dat { struct tcp_state state; struct ack_data_record ack[N_LINK_TCP_DATA]; int fwhole; /* Which firewall record is used for this * hole? */ }; struct server { /* LSNAT server pool (circular list) */ struct in_addr addr; u_short port; struct server *next; }; struct alias_link { /* Main data structure */ struct libalias *la; struct in_addr src_addr; /* Address and port information */ struct in_addr dst_addr; struct in_addr alias_addr; struct in_addr proxy_addr; u_short src_port; u_short dst_port; u_short alias_port; u_short proxy_port; struct server *server; int link_type; /* Type of link: TCP, UDP, ICMP, * proto, frag */ /* values for link_type */ #define LINK_ICMP IPPROTO_ICMP #define LINK_UDP IPPROTO_UDP #define LINK_TCP IPPROTO_TCP #define LINK_FRAGMENT_ID (IPPROTO_MAX + 1) #define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2) #define LINK_ADDR (IPPROTO_MAX + 3) #define LINK_PPTP (IPPROTO_MAX + 4) int flags; /* indicates special characteristics */ int pflags; /* protocol-specific flags */ /* flag bits */ #define LINK_UNKNOWN_DEST_PORT 0x01 #define LINK_UNKNOWN_DEST_ADDR 0x02 #define LINK_PERMANENT 0x04 #define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */ #define LINK_UNFIREWALLED 0x08 int timestamp; /* Time link was last accessed */ int expire_time; /* Expire time for link */ #ifndef NO_USE_SOCKETS int sockfd; /* socket descriptor */ #endif LIST_ENTRY (alias_link) list_out; /* Linked list of * pointers for */ LIST_ENTRY (alias_link) list_in; /* input and output * lookup tables */ union { /* Auxiliary data */ char *frag_ptr; struct in_addr frag_addr; struct tcp_dat *tcp; } data; }; /* Clean up procedure. */ static void finishoff(void); /* Kernel module definition.
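One detail of the flag bits defined above that the search code relies on: LINK_PARTIALLY_SPECIFIED is simply the OR of the two "unknown destination" bits, so a single mask test catches either case. A stand-alone demonstration (assumes only the three #define values shown above):

#include <assert.h>

#define LINK_UNKNOWN_DEST_PORT 0x01
#define LINK_UNKNOWN_DEST_ADDR 0x02
#define LINK_PARTIALLY_SPECIFIED 0x03   /* logical-or of first two bits */

int
main(void)
{
    int port_only = LINK_UNKNOWN_DEST_PORT; /* dest port unknown */
    int addr_only = LINK_UNKNOWN_DEST_ADDR; /* dest addr unknown */
    int full = 0;                           /* fully specified */

    assert(port_only & LINK_PARTIALLY_SPECIFIED);
    assert(addr_only & LINK_PARTIALLY_SPECIFIED);
    assert(!(full & LINK_PARTIALLY_SPECIFIED));
    return (0);
}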
*/ #ifdef _KERNEL MALLOC_DEFINE(M_ALIAS, "libalias", "packet aliasing"); MODULE_VERSION(libalias, 1); static int alias_mod_handler(module_t mod, int type, void *data) { int error; switch (type) { case MOD_LOAD: error = 0; break; case MOD_QUIESCE: case MOD_UNLOAD: finishoff(); error = 0; break; default: error = EINVAL; } return (error); } static moduledata_t alias_mod = { "alias", alias_mod_handler, NULL }; DECLARE_MODULE(alias, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); #endif /* Internal utility routines (used only in alias_db.c) Lookup table starting points: StartPointIn() -- link table initial search point for incoming packets StartPointOut() -- link table initial search point for outgoing packets Miscellaneous: SeqDiff() -- difference between two TCP sequences ShowAliasStats() -- send alias statistics to a monitor file */ /* Local prototypes */ static u_int StartPointIn(struct in_addr, u_short, int); static u_int StartPointOut(struct in_addr, struct in_addr, u_short, u_short, int); static int SeqDiff(u_long, u_long); #ifndef NO_FW_PUNCH /* Firewall control */ static void InitPunchFW(struct libalias *); static void UninitPunchFW(struct libalias *); static void ClearFWHole(struct alias_link *); #endif #ifndef NO_LOGGING /* Log file control */ static void ShowAliasStats(struct libalias *); static void InitPacketAliasLog(struct libalias *); static void UninitPacketAliasLog(struct libalias *); #endif static u_int StartPointIn(struct in_addr alias_addr, u_short alias_port, int link_type) { u_int n; n = alias_addr.s_addr; if (link_type != LINK_PPTP) n += alias_port; n += link_type; return (n % LINK_TABLE_IN_SIZE); } static u_int StartPointOut(struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, int link_type) { u_int n; n = src_addr.s_addr; n += dst_addr.s_addr; if (link_type != LINK_PPTP) { n += src_port; n += dst_port; } n += link_type; return (n % LINK_TABLE_OUT_SIZE); } static int SeqDiff(u_long x, u_long y) { /* Return the difference between two TCP sequence numbers */ /* This function is encapsulated in case there are any unusual arithmetic conditions that need to be considered. 
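One such condition worth illustrating is 32-bit wraparound: the subtraction is modular, so a small forward step across 0xffffffff still comes out as a small positive int (a sketch; the final unsigned-to-int conversion assumes the usual two's-complement behavior):

#include <stdio.h>
#include <arpa/inet.h>

/* Same expression as SeqDiff() below. */
static int
seq_diff(unsigned long x, unsigned long y)
{
    return (ntohl(y) - ntohl(x));
}

int
main(void)
{
    /* y is 32 bytes "after" x even though the counter wrapped. */
    unsigned long x = htonl(0xfffffff0u);
    unsigned long y = htonl(0x00000010u);

    printf("diff = %d\n", seq_diff(x, y));  /* prints 32 */
    return (0);
}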
*/ return (ntohl(y) - ntohl(x)); } #ifndef NO_LOGGING static void ShowAliasStats(struct libalias *la) { /* Used for debugging */ if (la->monitorFile) { fprintf(la->monitorFile, "icmp=%d, udp=%d, tcp=%d, pptp=%d, proto=%d, frag_id=%d frag_ptr=%d", la->icmpLinkCount, la->udpLinkCount, la->tcpLinkCount, la->pptpLinkCount, la->protoLinkCount, la->fragmentIdLinkCount, la->fragmentPtrLinkCount); fprintf(la->monitorFile, " / tot=%d (sock=%d)\n", la->icmpLinkCount + la->udpLinkCount + la->tcpLinkCount + la->pptpLinkCount + la->protoLinkCount + la->fragmentIdLinkCount + la->fragmentPtrLinkCount, la->sockCount); fflush(la->monitorFile); } } #endif /* Internal routines for finding, deleting and adding links Port Allocation: GetNewPort() -- find and reserve new alias port number GetSocket() -- try to allocate a socket for a given port Link creation and deletion: CleanupAliasData() - remove all link chains from lookup table IncrementalCleanup() - look for stale links in a single chain DeleteLink() - remove link AddLink() - add link ReLink() - change link Link search: FindLinkOut() - find link for outgoing packets FindLinkIn() - find link for incoming packets Port search: FindNewPortGroup() - find an available group of ports */ /* Local prototypes */ static int GetNewPort(struct libalias *, struct alias_link *, int); #ifndef NO_USE_SOCKETS static u_short GetSocket(struct libalias *, u_short, int *, int); #endif static void CleanupAliasData(struct libalias *); static void IncrementalCleanup(struct libalias *); static void DeleteLink(struct alias_link *); static struct alias_link * AddLink(struct libalias *, struct in_addr, struct in_addr, struct in_addr, u_short, u_short, int, int); static struct alias_link * ReLink(struct alias_link *, struct in_addr, struct in_addr, struct in_addr, u_short, u_short, int, int); static struct alias_link * FindLinkOut (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int); static struct alias_link * FindLinkIn (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int); #define ALIAS_PORT_BASE 0x08000 #define ALIAS_PORT_MASK 0x07fff #define ALIAS_PORT_MASK_EVEN 0x07ffe #define GET_NEW_PORT_MAX_ATTEMPTS 20 #define GET_ALIAS_PORT -1 #define GET_ALIAS_ID GET_ALIAS_PORT #define FIND_EVEN_ALIAS_BASE 1 /* GetNewPort() allocates port numbers. Note that if a port number is already in use, that does not mean that it cannot be used by another link concurrently. This is because GetNewPort() looks for unused triplets: (dest addr, dest port, alias port). */ static int GetNewPort(struct libalias *la, struct alias_link *lnk, int alias_port_param) { int i; int max_trials; u_short port_sys; u_short port_net; /* Description of alias_port_param for GetNewPort(). When this parameter is zero or positive, it precisely specifies the port number. GetNewPort() will return this number without checking that it is in use. When this parameter is GET_ALIAS_PORT, it indicates to get a randomly selected port number. */ if (alias_port_param == GET_ALIAS_PORT) { /* * The aliasing port is automatically selected by one of * two methods below: */ max_trials = GET_NEW_PORT_MAX_ATTEMPTS; if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) { /* * When the PKT_ALIAS_SAME_PORTS option is chosen, * the first try will be the actual source port. If * this is already in use, the remainder of the * trials will be random. */ port_net = lnk->src_port; port_sys = ntohs(port_net); } else { /* First trial and all subsequent are random.
*/ port_sys = random() & ALIAS_PORT_MASK; port_sys += ALIAS_PORT_BASE; port_net = htons(port_sys); } } else if (alias_port_param >= 0 && alias_port_param < 0x10000) { lnk->alias_port = (u_short) alias_port_param; return (0); } else { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/GetNewPort(): "); fprintf(stderr, "input parameter error\n"); #endif return (-1); } /* Port number search */ for (i = 0; i < max_trials; i++) { int go_ahead; struct alias_link *search_result; search_result = FindLinkIn(la, lnk->dst_addr, lnk->alias_addr, lnk->dst_port, port_net, lnk->link_type, 0); if (search_result == NULL) go_ahead = 1; else if (!(lnk->flags & LINK_PARTIALLY_SPECIFIED) && (search_result->flags & LINK_PARTIALLY_SPECIFIED)) go_ahead = 1; else go_ahead = 0; if (go_ahead) { #ifndef NO_USE_SOCKETS if ((la->packetAliasMode & PKT_ALIAS_USE_SOCKETS) && (lnk->flags & LINK_PARTIALLY_SPECIFIED) && ((lnk->link_type == LINK_TCP) || (lnk->link_type == LINK_UDP))) { if (GetSocket(la, port_net, &lnk->sockfd, lnk->link_type)) { lnk->alias_port = port_net; return (0); } } else { #endif lnk->alias_port = port_net; return (0); #ifndef NO_USE_SOCKETS } #endif } port_sys = random() & ALIAS_PORT_MASK; port_sys += ALIAS_PORT_BASE; port_net = htons(port_sys); } #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/GetNewPort(): "); fprintf(stderr, "could not find free port\n"); #endif return (-1); } #ifndef NO_USE_SOCKETS static u_short GetSocket(struct libalias *la, u_short port_net, int *sockfd, int link_type) { int err; int sock; struct sockaddr_in sock_addr; if (link_type == LINK_TCP) sock = socket(AF_INET, SOCK_STREAM, 0); else if (link_type == LINK_UDP) sock = socket(AF_INET, SOCK_DGRAM, 0); else { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/GetSocket(): "); fprintf(stderr, "incorrect link type\n"); #endif return (0); } if (sock < 0) { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/GetSocket(): "); fprintf(stderr, "socket() error %d\n", errno); #endif return (0); } sock_addr.sin_family = AF_INET; sock_addr.sin_addr.s_addr = htonl(INADDR_ANY); sock_addr.sin_port = port_net; err = bind(sock, (struct sockaddr *)&sock_addr, sizeof(sock_addr)); if (err == 0) { la->sockCount++; *sockfd = sock; return (1); } else { close(sock); return (0); } } #endif /* FindNewPortGroup() returns a base port number for an available range of contiguous port numbers. Note that if a port number is already in use, that does not mean that it cannot be used by another link concurrently. This is because FindNewPortGroup() looks for unused triplets: (dest addr, dest port, alias port). */ int FindNewPortGroup(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, u_short port_count, u_char proto, u_char align) { int i, j; int max_trials; u_short port_sys; int link_type; /* * Get link_type from protocol */ switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; break; case IPPROTO_TCP: link_type = LINK_TCP; break; default: return (0); break; } /* * The aliasing port is automatically selected by one of two * methods below: */ max_trials = GET_NEW_PORT_MAX_ATTEMPTS; if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) { /* * When the PKT_ALIAS_SAME_PORTS option is chosen, the first * try will be the actual source port. If this is already * in use, the remainder of the trials will be random. */ port_sys = ntohs(src_port); } else { /* First trial and all subsequent are random.
*/ if (align == FIND_EVEN_ALIAS_BASE) port_sys = random() & ALIAS_PORT_MASK_EVEN; else port_sys = random() & ALIAS_PORT_MASK; port_sys += ALIAS_PORT_BASE; } /* Port number search */ for (i = 0; i < max_trials; i++) { struct alias_link *search_result; for (j = 0; j < port_count; j++) if (0 != (search_result = FindLinkIn(la, dst_addr, alias_addr, dst_port, htons(port_sys + j), link_type, 0))) break; /* Found a good range, return base */ if (j == port_count) return (htons(port_sys)); /* Find a new base to try */ if (align == FIND_EVEN_ALIAS_BASE) port_sys = random() & ALIAS_PORT_MASK_EVEN; else port_sys = random() & ALIAS_PORT_MASK; port_sys += ALIAS_PORT_BASE; } #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/FindNewPortGroup(): "); fprintf(stderr, "could not find free port(s)\n"); #endif return (0); } static void CleanupAliasData(struct libalias *la) { struct alias_link *lnk; int i, icount; icount = 0; for (i = 0; i < LINK_TABLE_OUT_SIZE; i++) { lnk = LIST_FIRST(&la->linkTableOut[i]); while (lnk != NULL) { struct alias_link *link_next; link_next = LIST_NEXT(lnk, list_out); icount++; DeleteLink(lnk); lnk = link_next; } } la->cleanupIndex = 0; } static void IncrementalCleanup(struct libalias *la) { int icount; struct alias_link *lnk; icount = 0; lnk = LIST_FIRST(&la->linkTableOut[la->cleanupIndex++]); while (lnk != NULL) { int idelta; struct alias_link *link_next; link_next = LIST_NEXT(lnk, list_out); idelta = la->timeStamp - lnk->timestamp; switch (lnk->link_type) { case LINK_TCP: if (idelta > lnk->expire_time) { struct tcp_dat *tcp_aux; tcp_aux = lnk->data.tcp; if (tcp_aux->state.in != ALIAS_TCP_STATE_CONNECTED || tcp_aux->state.out != ALIAS_TCP_STATE_CONNECTED) { DeleteLink(lnk); icount++; } } break; default: if (idelta > lnk->expire_time) { DeleteLink(lnk); icount++; } break; } lnk = link_next; } if (la->cleanupIndex == LINK_TABLE_OUT_SIZE) la->cleanupIndex = 0; } static void DeleteLink(struct alias_link *lnk) { struct libalias *la = lnk->la; /* Don't do anything if the link is marked permanent */ if (la->deleteAllLinks == 0 && lnk->flags & LINK_PERMANENT) return; #ifndef NO_FW_PUNCH /* Delete associated firewall hole, if any */ ClearFWHole(lnk); #endif /* Free memory allocated for LSNAT server pool */ if (lnk->server != NULL) { struct server *head, *curr, *next; head = curr = lnk->server; do { next = curr->next; free(curr); } while ((curr = next) != head); } /* Adjust output table pointers */ LIST_REMOVE(lnk, list_out); /* Adjust input table pointers */ LIST_REMOVE(lnk, list_in); #ifndef NO_USE_SOCKETS /* Close socket, if one has been allocated */ if (lnk->sockfd != -1) { la->sockCount--; close(lnk->sockfd); } #endif /* Link-type dependent cleanup */ switch (lnk->link_type) { case LINK_ICMP: la->icmpLinkCount--; break; case LINK_UDP: la->udpLinkCount--; break; case LINK_TCP: la->tcpLinkCount--; free(lnk->data.tcp); break; case LINK_PPTP: la->pptpLinkCount--; break; case LINK_FRAGMENT_ID: la->fragmentIdLinkCount--; break; case LINK_FRAGMENT_PTR: la->fragmentPtrLinkCount--; if (lnk->data.frag_ptr != NULL) free(lnk->data.frag_ptr); break; case LINK_ADDR: break; default: la->protoLinkCount--; break; } /* Free memory */ free(lnk); #ifndef NO_LOGGING /* Write statistics, if logging enabled */ if (la->packetAliasMode & PKT_ALIAS_LOG) { ShowAliasStats(la); } #endif } static struct alias_link * AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, int alias_port_param, /* if less than zero, alias */ int 
link_type) { /* port will be automatically *//* chosen. * If greater than */ u_int start_point; /* zero, equal to alias port */ struct alias_link *lnk; lnk = malloc(sizeof(struct alias_link)); if (lnk != NULL) { /* Basic initialization */ lnk->la = la; lnk->src_addr = src_addr; lnk->dst_addr = dst_addr; lnk->alias_addr = alias_addr; lnk->proxy_addr.s_addr = INADDR_ANY; lnk->src_port = src_port; lnk->dst_port = dst_port; lnk->proxy_port = 0; lnk->server = NULL; lnk->link_type = link_type; #ifndef NO_USE_SOCKETS lnk->sockfd = -1; #endif lnk->flags = 0; lnk->pflags = 0; lnk->timestamp = la->timeStamp; /* Expiration time */ switch (link_type) { case LINK_ICMP: lnk->expire_time = ICMP_EXPIRE_TIME; break; case LINK_UDP: lnk->expire_time = UDP_EXPIRE_TIME; break; case LINK_TCP: lnk->expire_time = TCP_EXPIRE_INITIAL; break; case LINK_PPTP: lnk->flags |= LINK_PERMANENT; /* no timeout. */ break; case LINK_FRAGMENT_ID: lnk->expire_time = FRAGMENT_ID_EXPIRE_TIME; break; case LINK_FRAGMENT_PTR: lnk->expire_time = FRAGMENT_PTR_EXPIRE_TIME; break; case LINK_ADDR: break; default: lnk->expire_time = PROTO_EXPIRE_TIME; break; } /* Determine alias flags */ if (dst_addr.s_addr == INADDR_ANY) lnk->flags |= LINK_UNKNOWN_DEST_ADDR; if (dst_port == 0) lnk->flags |= LINK_UNKNOWN_DEST_PORT; /* Determine alias port */ if (GetNewPort(la, lnk, alias_port_param) != 0) { free(lnk); return (NULL); } /* Link-type dependent initialization */ switch (link_type) { struct tcp_dat *aux_tcp; case LINK_ICMP: la->icmpLinkCount++; break; case LINK_UDP: la->udpLinkCount++; break; case LINK_TCP: aux_tcp = malloc(sizeof(struct tcp_dat)); if (aux_tcp != NULL) { int i; la->tcpLinkCount++; aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED; aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED; aux_tcp->state.index = 0; aux_tcp->state.ack_modified = 0; for (i = 0; i < N_LINK_TCP_DATA; i++) aux_tcp->ack[i].active = 0; aux_tcp->fwhole = -1; lnk->data.tcp = aux_tcp; } else { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/AddLink: "); fprintf(stderr, " cannot allocate auxiliary TCP data\n"); #endif free(lnk); return (NULL); } break; case LINK_PPTP: la->pptpLinkCount++; break; case LINK_FRAGMENT_ID: la->fragmentIdLinkCount++; break; case LINK_FRAGMENT_PTR: la->fragmentPtrLinkCount++; break; case LINK_ADDR: break; default: la->protoLinkCount++; break; } /* Set up pointers for output lookup table */ start_point = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); LIST_INSERT_HEAD(&la->linkTableOut[start_point], lnk, list_out); /* Set up pointers for input lookup table */ start_point = StartPointIn(alias_addr, lnk->alias_port, link_type); LIST_INSERT_HEAD(&la->linkTableIn[start_point], lnk, list_in); } else { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/AddLink(): "); fprintf(stderr, "malloc() call failed.\n"); #endif } #ifndef NO_LOGGING if (la->packetAliasMode & PKT_ALIAS_LOG) { ShowAliasStats(la); } #endif return (lnk); } static struct alias_link * ReLink(struct alias_link *old_lnk, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, int alias_port_param, /* if less than zero, alias */ int link_type) { /* port will be automatically *//* chosen. 
* If greater than */ struct alias_link *new_lnk; /* zero, equal to alias port */ struct libalias *la = old_lnk->la; new_lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, dst_port, alias_port_param, link_type); #ifndef NO_FW_PUNCH if (new_lnk != NULL && old_lnk->link_type == LINK_TCP && old_lnk->data.tcp->fwhole > 0) { PunchFWHole(new_lnk); } #endif DeleteLink(old_lnk); return (new_lnk); } static struct alias_link * _FindLinkOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, int link_type, int replace_partial_links) { u_int i; struct alias_link *lnk; i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) { if (lnk->src_addr.s_addr == src_addr.s_addr && lnk->server == NULL && lnk->dst_addr.s_addr == dst_addr.s_addr && lnk->dst_port == dst_port && lnk->src_port == src_port && lnk->link_type == link_type) { lnk->timestamp = la->timeStamp; break; } } /* Search for partially specified links. */ if (lnk == NULL && replace_partial_links) { if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY) { lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, 0, link_type, 0); if (lnk == NULL) lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port, dst_port, link_type, 0); } if (lnk == NULL && (dst_port != 0 || dst_addr.s_addr != INADDR_ANY)) { lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port, 0, link_type, 0); } if (lnk != NULL) { lnk = ReLink(lnk, src_addr, dst_addr, lnk->alias_addr, src_port, dst_port, lnk->alias_port, link_type); } } return (lnk); } static struct alias_link * FindLinkOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, int link_type, int replace_partial_links) { struct alias_link *lnk; lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, replace_partial_links); if (lnk == NULL) { /* * The following allows permanent links to be specified as * using the default source address (i.e. device interface * address) without knowing in advance what that address * is. */ if (la->aliasAddress.s_addr != INADDR_ANY && src_addr.s_addr == la->aliasAddress.s_addr) { lnk = _FindLinkOut(la, la->nullAddress, dst_addr, src_port, dst_port, link_type, replace_partial_links); } } return (lnk); } static struct alias_link * _FindLinkIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short dst_port, u_short alias_port, int link_type, int replace_partial_links) { int flags_in; u_int start_point; struct alias_link *lnk; struct alias_link *lnk_fully_specified; struct alias_link *lnk_unknown_all; struct alias_link *lnk_unknown_dst_addr; struct alias_link *lnk_unknown_dst_port; /* Initialize pointers */ lnk_fully_specified = NULL; lnk_unknown_all = NULL; lnk_unknown_dst_addr = NULL; lnk_unknown_dst_port = NULL; /* If either the dest addr or port is unknown, the search loop will have to know about this. 
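It also has to rank whatever candidates turn up: a fully specified match always wins; among partial matches, one with only the destination port unknown is preferred to one with only the destination address unknown, which in turn beats one with both unknown. The same ranking, restated as a tiny stand-alone helper (hypothetical, for illustration only; the loop below implements it directly):

/* Rank candidate matches: lower value = more specific = preferred.
 * Mirrors the lnk_fully_specified / lnk_unknown_dst_port /
 * lnk_unknown_dst_addr / lnk_unknown_all ordering below. */
#define LINK_UNKNOWN_DEST_PORT 0x01
#define LINK_UNKNOWN_DEST_ADDR 0x02

static int
match_rank(int flags)
{
    if (!(flags & (LINK_UNKNOWN_DEST_PORT | LINK_UNKNOWN_DEST_ADDR)))
        return (0);     /* fully specified */
    if (!(flags & LINK_UNKNOWN_DEST_ADDR))
        return (1);     /* only dest port unknown */
    if (!(flags & LINK_UNKNOWN_DEST_PORT))
        return (2);     /* only dest addr unknown */
    return (3);         /* both unknown */
}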
*/ flags_in = 0; if (dst_addr.s_addr == INADDR_ANY) flags_in |= LINK_UNKNOWN_DEST_ADDR; if (dst_port == 0) flags_in |= LINK_UNKNOWN_DEST_PORT; /* Search loop */ start_point = StartPointIn(alias_addr, alias_port, link_type); LIST_FOREACH(lnk, &la->linkTableIn[start_point], list_in) { int flags; flags = flags_in | lnk->flags; if (!(flags & LINK_PARTIALLY_SPECIFIED)) { if (lnk->alias_addr.s_addr == alias_addr.s_addr && lnk->alias_port == alias_port && lnk->dst_addr.s_addr == dst_addr.s_addr && lnk->dst_port == dst_port && lnk->link_type == link_type) { lnk_fully_specified = lnk; break; } } else if ((flags & LINK_UNKNOWN_DEST_ADDR) && (flags & LINK_UNKNOWN_DEST_PORT)) { if (lnk->alias_addr.s_addr == alias_addr.s_addr && lnk->alias_port == alias_port && lnk->link_type == link_type) { if (lnk_unknown_all == NULL) lnk_unknown_all = lnk; } } else if (flags & LINK_UNKNOWN_DEST_ADDR) { if (lnk->alias_addr.s_addr == alias_addr.s_addr && lnk->alias_port == alias_port && lnk->link_type == link_type && lnk->dst_port == dst_port) { if (lnk_unknown_dst_addr == NULL) lnk_unknown_dst_addr = lnk; } } else if (flags & LINK_UNKNOWN_DEST_PORT) { if (lnk->alias_addr.s_addr == alias_addr.s_addr && lnk->alias_port == alias_port && lnk->link_type == link_type && lnk->dst_addr.s_addr == dst_addr.s_addr) { if (lnk_unknown_dst_port == NULL) lnk_unknown_dst_port = lnk; } } } if (lnk_fully_specified != NULL) { lnk_fully_specified->timestamp = la->timeStamp; lnk = lnk_fully_specified; } else if (lnk_unknown_dst_port != NULL) lnk = lnk_unknown_dst_port; else if (lnk_unknown_dst_addr != NULL) lnk = lnk_unknown_dst_addr; else if (lnk_unknown_all != NULL) lnk = lnk_unknown_all; else return (NULL); if (replace_partial_links && (lnk->flags & LINK_PARTIALLY_SPECIFIED || lnk->server != NULL)) { struct in_addr src_addr; u_short src_port; if (lnk->server != NULL) { /* LSNAT link */ src_addr = lnk->server->addr; src_port = lnk->server->port; lnk->server = lnk->server->next; } else { src_addr = lnk->src_addr; src_port = lnk->src_port; } lnk = ReLink(lnk, src_addr, dst_addr, alias_addr, src_port, dst_port, alias_port, link_type); } return (lnk); } static struct alias_link * FindLinkIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short dst_port, u_short alias_port, int link_type, int replace_partial_links) { struct alias_link *lnk; lnk = _FindLinkIn(la, dst_addr, alias_addr, dst_port, alias_port, link_type, replace_partial_links); if (lnk == NULL) { /* * The following allows permanent links to be specified as * using the default aliasing address (i.e. device * interface address) without knowing in advance what that * address is. 
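Concretely, this is what lets a permanent redirect be configured before the interface address is even known (a user-space sketch; the server address and port numbers are invented for the example):

#include <netinet/in.h>
#include "alias.h"      /* LibAliasRedirectPort() declaration */

static void
redirect_example(struct libalias *la)
{
    struct in_addr any, server;

    any.s_addr = INADDR_ANY;            /* "use the alias address" */
    server.s_addr = htonl(0xc0a80002);  /* hypothetical 192.168.0.2 */

    /* Permanent inbound redirect: <alias addr>:2300/tcp -> 192.168.0.2:23.
     * alias_addr and dst_addr are both left unspecified; the fallback
     * described above resolves them when packets actually arrive. */
    LibAliasRedirectPort(la, server, htons(23), any, 0,
        any, htons(2300), IPPROTO_TCP);
}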
*/ if (la->aliasAddress.s_addr != INADDR_ANY && alias_addr.s_addr == la->aliasAddress.s_addr) { lnk = _FindLinkIn(la, dst_addr, la->nullAddress, dst_port, alias_port, link_type, replace_partial_links); } } return (lnk); } /* External routines for finding/adding links -- "external" means outside alias_db.c, but within alias*.c -- FindIcmpIn(), FindIcmpOut() FindFragmentIn1(), FindFragmentIn2() AddFragmentPtrLink(), FindFragmentPtr() FindProtoIn(), FindProtoOut() FindUdpTcpIn(), FindUdpTcpOut() AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(), FindPptpOutByPeerCallId(), FindPptpInByPeerCallId() FindOriginalAddress(), FindAliasAddress() (prototypes in alias_local.h) */ struct alias_link * FindIcmpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short id_alias, int create) { struct alias_link *lnk; lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, id_alias, LINK_ICMP, 0); if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { struct in_addr target_addr; target_addr = FindOriginalAddress(la, alias_addr); lnk = AddLink(la, target_addr, dst_addr, alias_addr, id_alias, NO_DEST_PORT, id_alias, LINK_ICMP); } return (lnk); } struct alias_link * FindIcmpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short id, int create) { struct alias_link *lnk; lnk = FindLinkOut(la, src_addr, dst_addr, id, NO_DEST_PORT, LINK_ICMP, 0); if (lnk == NULL && create) { struct in_addr alias_addr; alias_addr = FindAliasAddress(la, src_addr); lnk = AddLink(la, src_addr, dst_addr, alias_addr, id, NO_DEST_PORT, GET_ALIAS_ID, LINK_ICMP); } return (lnk); } struct alias_link * FindFragmentIn1(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short ip_id) { struct alias_link *lnk; lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, ip_id, LINK_FRAGMENT_ID, 0); if (lnk == NULL) { lnk = AddLink(la, la->nullAddress, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, ip_id, LINK_FRAGMENT_ID); } return (lnk); } struct alias_link * FindFragmentIn2(struct libalias *la, struct in_addr dst_addr, /* Doesn't add a link if * one */ struct in_addr alias_addr, /* is not found. 
*/ u_short ip_id) { return FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, ip_id, LINK_FRAGMENT_ID, 0); } struct alias_link * AddFragmentPtrLink(struct libalias *la, struct in_addr dst_addr, u_short ip_id) { return AddLink(la, la->nullAddress, dst_addr, la->nullAddress, NO_SRC_PORT, NO_DEST_PORT, ip_id, LINK_FRAGMENT_PTR); } struct alias_link * FindFragmentPtr(struct libalias *la, struct in_addr dst_addr, u_short ip_id) { return FindLinkIn(la, dst_addr, la->nullAddress, NO_DEST_PORT, ip_id, LINK_FRAGMENT_PTR, 0); } struct alias_link * FindProtoIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_char proto) { struct alias_link *lnk; lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, 0, proto, 1); if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { struct in_addr target_addr; target_addr = FindOriginalAddress(la, alias_addr); lnk = AddLink(la, target_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); } return (lnk); } struct alias_link * FindProtoOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_char proto) { struct alias_link *lnk; lnk = FindLinkOut(la, src_addr, dst_addr, NO_SRC_PORT, NO_DEST_PORT, proto, 1); if (lnk == NULL) { struct in_addr alias_addr; alias_addr = FindAliasAddress(la, src_addr); lnk = AddLink(la, src_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); } return (lnk); } struct alias_link * FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short dst_port, u_short alias_port, u_char proto, int create) { int link_type; struct alias_link *lnk; switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; break; case IPPROTO_TCP: link_type = LINK_TCP; break; default: return (NULL); break; } lnk = FindLinkIn(la, dst_addr, alias_addr, dst_port, alias_port, link_type, create); if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { struct in_addr target_addr; target_addr = FindOriginalAddress(la, alias_addr); lnk = AddLink(la, target_addr, dst_addr, alias_addr, alias_port, dst_port, alias_port, link_type); } return (lnk); } struct alias_link * FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, u_char proto, int create) { int link_type; struct alias_link *lnk; switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; break; case IPPROTO_TCP: link_type = LINK_TCP; break; default: return (NULL); break; } lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create); if (lnk == NULL && create) { struct in_addr alias_addr; alias_addr = FindAliasAddress(la, src_addr); lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, dst_port, GET_ALIAS_PORT, link_type); } return (lnk); } struct alias_link * AddPptp(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_int16_t src_call_id) { struct alias_link *lnk; lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_call_id, 0, GET_ALIAS_PORT, LINK_PPTP); return (lnk); } struct alias_link * FindPptpOutByCallId(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_int16_t src_call_id) { u_int i; struct alias_link *lnk; i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) if (lnk->link_type == LINK_PPTP && lnk->src_addr.s_addr == src_addr.s_addr && lnk->dst_addr.s_addr == dst_addr.s_addr && lnk->src_port == src_call_id) break; return (lnk); } struct alias_link * 
FindPptpOutByPeerCallId(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_int16_t dst_call_id) { u_int i; struct alias_link *lnk; i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) if (lnk->link_type == LINK_PPTP && lnk->src_addr.s_addr == src_addr.s_addr && lnk->dst_addr.s_addr == dst_addr.s_addr && lnk->dst_port == dst_call_id) break; return (lnk); } struct alias_link * FindPptpInByCallId(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_int16_t dst_call_id) { u_int i; struct alias_link *lnk; i = StartPointIn(alias_addr, 0, LINK_PPTP); LIST_FOREACH(lnk, &la->linkTableIn[i], list_in) if (lnk->link_type == LINK_PPTP && lnk->dst_addr.s_addr == dst_addr.s_addr && lnk->alias_addr.s_addr == alias_addr.s_addr && lnk->dst_port == dst_call_id) break; return (lnk); } struct alias_link * FindPptpInByPeerCallId(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_int16_t alias_call_id) { struct alias_link *lnk; lnk = FindLinkIn(la, dst_addr, alias_addr, 0 /* any */ , alias_call_id, LINK_PPTP, 0); return (lnk); } struct alias_link * FindRtspOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short alias_port, u_char proto) { int link_type; struct alias_link *lnk; switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; break; case IPPROTO_TCP: link_type = LINK_TCP; break; default: return (NULL); break; } lnk = FindLinkOut(la, src_addr, dst_addr, src_port, 0, link_type, 1); if (lnk == NULL) { struct in_addr alias_addr; alias_addr = FindAliasAddress(la, src_addr); lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, 0, alias_port, link_type); } return (lnk); } struct in_addr FindOriginalAddress(struct libalias *la, struct in_addr alias_addr) { struct alias_link *lnk; lnk = FindLinkIn(la, la->nullAddress, alias_addr, 0, 0, LINK_ADDR, 0); if (lnk == NULL) { la->newDefaultLink = 1; if (la->targetAddress.s_addr == INADDR_ANY) return (alias_addr); else if (la->targetAddress.s_addr == INADDR_NONE) return (la->aliasAddress.s_addr != INADDR_ANY) ? la->aliasAddress : alias_addr; else return (la->targetAddress); } else { if (lnk->server != NULL) { /* LSNAT link */ struct in_addr src_addr; src_addr = lnk->server->addr; lnk->server = lnk->server->next; return (src_addr); } else if (lnk->src_addr.s_addr == INADDR_ANY) return (la->aliasAddress.s_addr != INADDR_ANY) ? la->aliasAddress : alias_addr; else return (lnk->src_addr); } } struct in_addr FindAliasAddress(struct libalias *la, struct in_addr original_addr) { struct alias_link *lnk; lnk = FindLinkOut(la, original_addr, la->nullAddress, 0, 0, LINK_ADDR, 0); if (lnk == NULL) { return (la->aliasAddress.s_addr != INADDR_ANY) ? la->aliasAddress : original_addr; } else { if (lnk->alias_addr.s_addr == INADDR_ANY) return (la->aliasAddress.s_addr != INADDR_ANY) ? 
la->aliasAddress : original_addr; else return (lnk->alias_addr); } } /* External routines for getting or changing link data (external to alias_db.c, but internal to alias*.c) SetFragmentData(), GetFragmentData() SetFragmentPtr(), GetFragmentPtr() SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut() GetOriginalAddress(), GetDestAddress(), GetAliasAddress() GetOriginalPort(), GetAliasPort() SetAckModified(), GetAckModified() GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq() SetProtocolFlags(), GetProtocolFlags() SetDestCallId() */ void SetFragmentAddr(struct alias_link *lnk, struct in_addr src_addr) { lnk->data.frag_addr = src_addr; } void GetFragmentAddr(struct alias_link *lnk, struct in_addr *src_addr) { *src_addr = lnk->data.frag_addr; } void SetFragmentPtr(struct alias_link *lnk, char *fptr) { lnk->data.frag_ptr = fptr; } void GetFragmentPtr(struct alias_link *lnk, char **fptr) { *fptr = lnk->data.frag_ptr; } void SetStateIn(struct alias_link *lnk, int state) { /* TCP input state */ switch (state) { case ALIAS_TCP_STATE_DISCONNECTED: if (lnk->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED) lnk->expire_time = TCP_EXPIRE_DEAD; else lnk->expire_time = TCP_EXPIRE_SINGLEDEAD; break; case ALIAS_TCP_STATE_CONNECTED: if (lnk->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED) lnk->expire_time = TCP_EXPIRE_CONNECTED; break; default: #ifdef _KERNEL panic("libalias:SetStateIn() unknown state"); #else abort(); #endif } lnk->data.tcp->state.in = state; } void SetStateOut(struct alias_link *lnk, int state) { /* TCP output state */ switch (state) { case ALIAS_TCP_STATE_DISCONNECTED: if (lnk->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED) lnk->expire_time = TCP_EXPIRE_DEAD; else lnk->expire_time = TCP_EXPIRE_SINGLEDEAD; break; case ALIAS_TCP_STATE_CONNECTED: if (lnk->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED) lnk->expire_time = TCP_EXPIRE_CONNECTED; break; default: #ifdef _KERNEL panic("libalias:SetStateOut() unknown state"); #else abort(); #endif } lnk->data.tcp->state.out = state; } int GetStateIn(struct alias_link *lnk) { /* TCP input state */ return (lnk->data.tcp->state.in); } int GetStateOut(struct alias_link *lnk) { /* TCP output state */ return (lnk->data.tcp->state.out); } struct in_addr GetOriginalAddress(struct alias_link *lnk) { if (lnk->src_addr.s_addr == INADDR_ANY) return (lnk->la->aliasAddress); else return (lnk->src_addr); } struct in_addr GetDestAddress(struct alias_link *lnk) { return (lnk->dst_addr); } struct in_addr GetAliasAddress(struct alias_link *lnk) { if (lnk->alias_addr.s_addr == INADDR_ANY) return (lnk->la->aliasAddress); else return (lnk->alias_addr); } struct in_addr GetDefaultAliasAddress(struct libalias *la) { return (la->aliasAddress); } void SetDefaultAliasAddress(struct libalias *la, struct in_addr alias_addr) { la->aliasAddress = alias_addr; } u_short GetOriginalPort(struct alias_link *lnk) { return (lnk->src_port); } u_short GetAliasPort(struct alias_link *lnk) { return (lnk->alias_port); } #ifndef NO_FW_PUNCH static u_short GetDestPort(struct alias_link *lnk) { return (lnk->dst_port); } #endif void SetAckModified(struct alias_link *lnk) { /* Indicate that ACK numbers have been modified in a TCP connection */ lnk->data.tcp->state.ack_modified = 1; } struct in_addr GetProxyAddress(struct alias_link *lnk) { return (lnk->proxy_addr); } void SetProxyAddress(struct alias_link *lnk, struct in_addr addr) { lnk->proxy_addr = addr; } u_short GetProxyPort(struct alias_link *lnk) { return (lnk->proxy_port); } void SetProxyPort(struct alias_link *lnk, u_short port) { 
lnk->proxy_port = port; } int GetAckModified(struct alias_link *lnk) { /* See if ACK numbers have been modified */ return (lnk->data.tcp->state.ack_modified); } int GetDeltaAckIn(struct ip *pip, struct alias_link *lnk) { /* Find out how much the ACK number has been altered for an incoming TCP packet. To do this, a circular list of ACK numbers where the TCP packet size was altered is searched. */ int i; struct tcphdr *tc; int delta, ack_diff_min; u_long ack; tc = ip_next(pip); ack = tc->th_ack; delta = 0; ack_diff_min = -1; for (i = 0; i < N_LINK_TCP_DATA; i++) { struct ack_data_record x; x = lnk->data.tcp->ack[i]; if (x.active == 1) { int ack_diff; ack_diff = SeqDiff(x.ack_new, ack); if (ack_diff >= 0) { if (ack_diff_min >= 0) { if (ack_diff < ack_diff_min) { delta = x.delta; ack_diff_min = ack_diff; } } else { delta = x.delta; ack_diff_min = ack_diff; } } } } return (delta); } int GetDeltaSeqOut(struct ip *pip, struct alias_link *lnk) { /* Find out how much the sequence number has been altered for an outgoing TCP packet. To do this, a circular list of ACK numbers where the TCP packet size was altered is searched. */ int i; struct tcphdr *tc; int delta, seq_diff_min; u_long seq; tc = ip_next(pip); seq = tc->th_seq; delta = 0; seq_diff_min = -1; for (i = 0; i < N_LINK_TCP_DATA; i++) { struct ack_data_record x; x = lnk->data.tcp->ack[i]; if (x.active == 1) { int seq_diff; seq_diff = SeqDiff(x.ack_old, seq); if (seq_diff >= 0) { if (seq_diff_min >= 0) { if (seq_diff < seq_diff_min) { delta = x.delta; seq_diff_min = seq_diff; } } else { delta = x.delta; seq_diff_min = seq_diff; } } } } return (delta); } void AddSeq(struct ip *pip, struct alias_link *lnk, int delta) { /* When a TCP packet has been altered in length, save this information in a circular list. If enough packets have been altered, then this list will begin to overwrite itself. */ struct tcphdr *tc; struct ack_data_record x; int hlen, tlen, dlen; int i; tc = ip_next(pip); hlen = (pip->ip_hl + tc->th_off) << 2; tlen = ntohs(pip->ip_len); dlen = tlen - hlen; x.ack_old = htonl(ntohl(tc->th_seq) + dlen); x.ack_new = htonl(ntohl(tc->th_seq) + dlen + delta); x.delta = delta; x.active = 1; i = lnk->data.tcp->state.index; lnk->data.tcp->ack[i] = x; i++; if (i == N_LINK_TCP_DATA) lnk->data.tcp->state.index = 0; else lnk->data.tcp->state.index = i; } void SetExpire(struct alias_link *lnk, int expire) { if (expire == 0) { lnk->flags &= ~LINK_PERMANENT; DeleteLink(lnk); } else if (expire == -1) { lnk->flags |= LINK_PERMANENT; } else if (expire > 0) { lnk->expire_time = expire; } else { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/SetExpire(): "); fprintf(stderr, "error in expire parameter\n"); #endif } } void ClearCheckNewLink(struct libalias *la) { la->newDefaultLink = 0; } void SetProtocolFlags(struct alias_link *lnk, int pflags) { lnk->pflags = pflags; } int GetProtocolFlags(struct alias_link *lnk) { return (lnk->pflags); } void SetDestCallId(struct alias_link *lnk, u_int16_t cid) { struct libalias *la = lnk->la; la->deleteAllLinks = 1; lnk = ReLink(lnk, lnk->src_addr, lnk->dst_addr, lnk->alias_addr, lnk->src_port, cid, lnk->alias_port, lnk->link_type); la->deleteAllLinks = 0; } /* Miscellaneous Functions HouseKeeping() InitPacketAliasLog() UninitPacketAliasLog() */ /* Whenever an outgoing or incoming packet is handled, HouseKeeping() is called to find and remove timed-out aliasing links. Logic exists to sweep through the entire table and linked list structure every 60 seconds.
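To make the rate arithmetic concrete, here is a stand-alone rerun of the computation done in HouseKeeping() below, with an assumed table size (LINK_TABLE_OUT_SIZE itself is defined in alias_local.h):

#include <stdio.h>

#define TABLE_SIZE 4001             /* assumed LINK_TABLE_OUT_SIZE */
#define CLEANUP_INTERVAL_SECS 60    /* ALIAS_CLEANUP_INTERVAL_SECS */

int
main(void)
{
    int residual = 0;   /* hundredths of a spoke carried over */
    int elapsed = 1;    /* seconds since the previous call */
    int n100, n;

    /* Work in hundredths of a spoke so the fractional part of
     * table_size / 60 is carried forward instead of truncated. */
    n100 = (TABLE_SIZE * 100 + residual) * elapsed / CLEANUP_INTERVAL_SECS;
    n = n100 / 100;                 /* whole spokes to sweep now */
    residual = n100 - 100 * n;      /* saved for the next call */

    printf("sweep %d spokes, carry %d/100\n", n, residual); /* 66, 68 */
    return (0);
}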
(prototype in alias_local.h) */ void HouseKeeping(struct libalias *la) { int i, n, n100; #ifndef _KERNEL struct timeval tv; struct timezone tz; #endif /* * Save system time (seconds) in global variable timeStamp for use * by other functions. This is done so as not to unnecessarily * waste timeline by making system calls. */ #ifdef _KERNEL - la->timeStamp = time_second; + la->timeStamp = time_uptime; #else gettimeofday(&tv, &tz); la->timeStamp = tv.tv_sec; #endif /* Compute number of spokes (output table link chains) to cover */ n100 = LINK_TABLE_OUT_SIZE * 100 + la->houseKeepingResidual; n100 *= la->timeStamp - la->lastCleanupTime; n100 /= ALIAS_CLEANUP_INTERVAL_SECS; n = n100 / 100; /* Handle different cases */ if (n > ALIAS_CLEANUP_MAX_SPOKES) { n = ALIAS_CLEANUP_MAX_SPOKES; la->lastCleanupTime = la->timeStamp; la->houseKeepingResidual = 0; for (i = 0; i < n; i++) IncrementalCleanup(la); } else if (n > 0) { la->lastCleanupTime = la->timeStamp; la->houseKeepingResidual = n100 - 100 * n; for (i = 0; i < n; i++) IncrementalCleanup(la); } else if (n < 0) { #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAlias/HouseKeeping(): "); fprintf(stderr, "something unexpected in time values\n"); #endif la->lastCleanupTime = la->timeStamp; la->houseKeepingResidual = 0; } } #ifndef NO_LOGGING /* Init the log file and enable logging */ static void InitPacketAliasLog(struct libalias *la) { if ((~la->packetAliasMode & PKT_ALIAS_LOG) && (la->monitorFile = fopen("/var/log/alias.log", "w"))) { la->packetAliasMode |= PKT_ALIAS_LOG; fprintf(la->monitorFile, "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n"); } } /* Close the log-file and disable logging. */ static void UninitPacketAliasLog(struct libalias *la) { if (la->monitorFile) { fclose(la->monitorFile); la->monitorFile = NULL; } la->packetAliasMode &= ~PKT_ALIAS_LOG; } #endif /* Outside world interfaces -- "outside world" means other than alias*.c routines -- PacketAliasRedirectPort() PacketAliasAddServer() PacketAliasRedirectProto() PacketAliasRedirectAddr() PacketAliasRedirectDynamic() PacketAliasRedirectDelete() PacketAliasSetAddress() PacketAliasInit() PacketAliasUninit() PacketAliasSetMode() (prototypes in alias.h) */ /* Redirection from a specific public addr:port to a private addr:port */ struct alias_link * LibAliasRedirectPort(struct libalias *la, struct in_addr src_addr, u_short src_port, struct in_addr dst_addr, u_short dst_port, struct in_addr alias_addr, u_short alias_port, u_char proto) { int link_type; struct alias_link *lnk; switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; break; case IPPROTO_TCP: link_type = LINK_TCP; break; default: #ifdef LIBALIAS_DEBUG fprintf(stderr, "PacketAliasRedirectPort(): "); fprintf(stderr, "only TCP and UDP protocols allowed\n"); #endif return (NULL); } lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, dst_port, alias_port, link_type); if (lnk != NULL) { lnk->flags |= LINK_PERMANENT; } #ifdef LIBALIAS_DEBUG else { fprintf(stderr, "PacketAliasRedirectPort(): " "call to AddLink() failed\n"); } #endif return (lnk); } /* Add server to the pool of servers */ int LibAliasAddServer(struct libalias *la, struct alias_link *lnk, struct in_addr addr, u_short port) { struct server *server; (void)la; server = malloc(sizeof(struct server)); if (server != NULL) { struct server *head; server->addr = addr; server->port = port; head = lnk->server; if (head == NULL) server->next = server; else { struct server *s; for (s = head; s->next != head; s = s->next); s->next = server; server->next = head; } 
lnk->server = server; return (0); } else return (-1); } /* Redirect packets of a given IP protocol from a specific public address to a private address */ struct alias_link * LibAliasRedirectProto(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_char proto) { struct alias_link *lnk; lnk = AddLink(la, src_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); if (lnk != NULL) { lnk->flags |= LINK_PERMANENT; } #ifdef LIBALIAS_DEBUG else { fprintf(stderr, "PacketAliasRedirectProto(): " "call to AddLink() failed\n"); } #endif return (lnk); } /* Static address translation */ struct alias_link * LibAliasRedirectAddr(struct libalias *la, struct in_addr src_addr, struct in_addr alias_addr) { struct alias_link *lnk; lnk = AddLink(la, src_addr, la->nullAddress, alias_addr, 0, 0, 0, LINK_ADDR); if (lnk != NULL) { lnk->flags |= LINK_PERMANENT; } #ifdef LIBALIAS_DEBUG else { fprintf(stderr, "PacketAliasRedirectAddr(): " "call to AddLink() failed\n"); } #endif return (lnk); } /* Mark the aliasing link dynamic */ int LibAliasRedirectDynamic(struct libalias *la, struct alias_link *lnk) { (void)la; if (lnk->flags & LINK_PARTIALLY_SPECIFIED) return (-1); else { lnk->flags &= ~LINK_PERMANENT; return (0); } } void LibAliasRedirectDelete(struct libalias *la, struct alias_link *lnk) { /* This is a dangerous function to put in the API, because an invalid pointer can crash the program. */ la->deleteAllLinks = 1; DeleteLink(lnk); la->deleteAllLinks = 0; } void LibAliasSetAddress(struct libalias *la, struct in_addr addr) { if (la->packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE && la->aliasAddress.s_addr != addr.s_addr) CleanupAliasData(la); la->aliasAddress = addr; } void LibAliasSetTarget(struct libalias *la, struct in_addr target_addr) { la->targetAddress = target_addr; } static void finishoff(void) { while (!LIST_EMPTY(&instancehead)) LibAliasUninit(LIST_FIRST(&instancehead)); } struct libalias * LibAliasInit(struct libalias *la) { int i; #ifndef _KERNEL struct timeval tv; struct timezone tz; #endif if (la == NULL) { la = calloc(sizeof *la, 1); if (la == NULL) return (la); #ifndef _KERNEL /* kernel cleans up on module unload */ if (LIST_EMPTY(&instancehead)) atexit(finishoff); #endif LIST_INSERT_HEAD(&instancehead, la, instancelist); #ifdef _KERNEL - la->timeStamp = time_second; - la->lastCleanupTime = time_second; + la->timeStamp = time_uptime; + la->lastCleanupTime = time_uptime; #else gettimeofday(&tv, &tz); la->timeStamp = tv.tv_sec; la->lastCleanupTime = tv.tv_sec; #endif la->houseKeepingResidual = 0; for (i = 0; i < LINK_TABLE_OUT_SIZE; i++) LIST_INIT(&la->linkTableOut[i]); for (i = 0; i < LINK_TABLE_IN_SIZE; i++) LIST_INIT(&la->linkTableIn[i]); } else { la->deleteAllLinks = 1; CleanupAliasData(la); la->deleteAllLinks = 0; } la->aliasAddress.s_addr = INADDR_ANY; la->targetAddress.s_addr = INADDR_ANY; la->icmpLinkCount = 0; la->udpLinkCount = 0; la->tcpLinkCount = 0; la->pptpLinkCount = 0; la->protoLinkCount = 0; la->fragmentIdLinkCount = 0; la->fragmentPtrLinkCount = 0; la->sockCount = 0; la->cleanupIndex = 0; la->packetAliasMode = PKT_ALIAS_SAME_PORTS #ifndef NO_USE_SOCKETS | PKT_ALIAS_USE_SOCKETS #endif | PKT_ALIAS_RESET_ON_ADDR_CHANGE; #ifndef NO_FW_PUNCH la->fireWallFD = -1; #endif return (la); } void LibAliasUninit(struct libalias *la) { la->deleteAllLinks = 1; CleanupAliasData(la); la->deleteAllLinks = 0; #ifndef NO_LOGGING UninitPacketAliasLog(la); #endif #ifndef NO_FW_PUNCH UninitPunchFW(la); #endif LIST_REMOVE(la, instancelist); 
/* Change mode for some operations */
unsigned int
LibAliasSetMode(
    struct libalias *la,
    unsigned int flags,		/* Which state to bring flags to */
    unsigned int mask		/* Mask of which flags to affect (use 0 to
				 * do a probe for flag values) */
)
{
#ifndef NO_LOGGING
/* Enable logging? */
	if (flags & mask & PKT_ALIAS_LOG) {
		InitPacketAliasLog(la);	/* Do the enable */
	} else
/* _Disable_ logging? */
	if (~flags & mask & PKT_ALIAS_LOG) {
		UninitPacketAliasLog(la);
	}
#endif

#ifndef NO_FW_PUNCH
/* Start punching holes in the firewall? */
	if (flags & mask & PKT_ALIAS_PUNCH_FW) {
		InitPunchFW(la);
	} else
/* Stop punching holes in the firewall? */
	if (~flags & mask & PKT_ALIAS_PUNCH_FW) {
		UninitPunchFW(la);
	}
#endif

/* Other flags can be set/cleared without special action */
	la->packetAliasMode = (flags & mask) | (la->packetAliasMode & ~mask);
	return (la->packetAliasMode);
}

int
LibAliasCheckNewLink(struct libalias *la)
{
	return (la->newDefaultLink);
}

#ifndef NO_FW_PUNCH

/*****************
  Code to support firewall punching.  This shouldn't really be in this
  file, but making variables global is evil too.
  ****************/

/* Firewall include files */
#include <net/if.h>
#include <netinet/ip_fw.h>
#include <string.h>
#include <err.h>

/*
 * helper function, updates the pointer to cmd with the length
 * of the current command, and also cleans up the first word of
 * the new command in case it has been clobbered before.
 */
static ipfw_insn *
next_cmd(ipfw_insn * cmd)
{
	cmd += F_LEN(cmd);
	bzero(cmd, sizeof(*cmd));
	return (cmd);
}

/*
 * A function to fill simple commands of size 1.
 * Existing flags are preserved.
 */
static ipfw_insn *
fill_cmd(ipfw_insn * cmd, enum ipfw_opcodes opcode, int size,
    int flags, u_int16_t arg)
{
	cmd->opcode = opcode;
	cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | (size & F_LEN_MASK);
	cmd->arg1 = arg;
	return next_cmd(cmd);
}

static ipfw_insn *
fill_ip(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int32_t addr)
{
	ipfw_insn_ip *cmd = (ipfw_insn_ip *) cmd1;

	cmd->addr.s_addr = addr;
	return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u32), 0, 0);
}

static ipfw_insn *
fill_one_port(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int16_t port)
{
	ipfw_insn_u16 *cmd = (ipfw_insn_u16 *) cmd1;

	cmd->ports[0] = cmd->ports[1] = port;
	return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u16), 0, 0);
}

static int
fill_rule(void *buf, int bufsize, int rulenum,
    enum ipfw_opcodes action, int proto,
    struct in_addr sa, u_int16_t sp, struct in_addr da, u_int16_t dp)
{
	struct ip_fw *rule = (struct ip_fw *)buf;
	ipfw_insn *cmd = (ipfw_insn *) rule->cmd;

	bzero(buf, bufsize);
	rule->rulenum = rulenum;

	cmd = fill_cmd(cmd, O_PROTO, F_INSN_SIZE(ipfw_insn), 0, proto);
	cmd = fill_ip(cmd, O_IP_SRC, sa.s_addr);
	cmd = fill_one_port(cmd, O_IP_SRCPORT, sp);
	cmd = fill_ip(cmd, O_IP_DST, da.s_addr);
	cmd = fill_one_port(cmd, O_IP_DSTPORT, dp);

	rule->act_ofs = (u_int32_t *) cmd - (u_int32_t *) rule->cmd;
	cmd = fill_cmd(cmd, action, F_INSN_SIZE(ipfw_insn), 0, 0);

	rule->cmd_len = (u_int32_t *) cmd - (u_int32_t *) rule->cmd;

	return ((char *)cmd - (char *)buf);
}

static void	ClearAllFWHoles(struct libalias *la);

#define fw_setfield(la, field, num)                         \
do {                                                        \
    (field)[(num) - la->fireWallBaseNum] = 1;               \
} /*lint -save -e717 */ while(0)/* lint -restore */

#define fw_clrfield(la, field, num)                         \
do {                                                        \
    (field)[(num) - la->fireWallBaseNum] = 0;               \
} /*lint -save -e717 */ while(0)/* lint -restore */

#define fw_tstfield(la, field, num) ((field)[(num) - la->fireWallBaseNum])
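/*
 * For orientation (an illustrative note, not library code): a buffer
 * filled by, say, fill_rule(buf, sizeof(buf), 3300, O_ACCEPT,
 * IPPROTO_TCP, sa, sp, da, dp) corresponds to the ipfw(8) rule
 *
 *	ipfw add 3300 allow tcp from <sa> <sp> to <da> <dp>
 *
 * that is, five match opcodes followed by one action opcode, with
 * act_ofs and cmd_len measured in 32-bit instruction words.  The
 * rule number 3300 is a made-up value.
 */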
static void
InitPunchFW(struct libalias *la)
{
	la->fireWallField = malloc(la->fireWallNumNums);
	if (la->fireWallField) {
		memset(la->fireWallField, 0, la->fireWallNumNums);
		if (la->fireWallFD < 0) {
			la->fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
		}
		ClearAllFWHoles(la);
		la->fireWallActiveNum = la->fireWallBaseNum;
	}
}

static void
UninitPunchFW(struct libalias *la)
{
	ClearAllFWHoles(la);
	if (la->fireWallFD >= 0)
		close(la->fireWallFD);
	la->fireWallFD = -1;
	if (la->fireWallField)
		free(la->fireWallField);
	la->fireWallField = NULL;
	la->packetAliasMode &= ~PKT_ALIAS_PUNCH_FW;
}

/* Make a certain link go through the firewall */
void
PunchFWHole(struct alias_link *lnk)
{
	struct libalias *la;
	int r;			/* Result code */
	struct ip_fw rule;	/* On-the-fly built rule */
	int fwhole;		/* Where to punch hole */

	la = lnk->la;

/* Don't do anything unless we are asked to */
	if (!(la->packetAliasMode & PKT_ALIAS_PUNCH_FW) ||
	    la->fireWallFD < 0 ||
	    lnk->link_type != LINK_TCP)
		return;

	memset(&rule, 0, sizeof rule);

/** Build rule **/

	/* Find empty slot */
	for (fwhole = la->fireWallActiveNum;
	    fwhole < la->fireWallBaseNum + la->fireWallNumNums &&
	    fw_tstfield(la, la->fireWallField, fwhole);
	    fwhole++);
	if (fwhole == la->fireWallBaseNum + la->fireWallNumNums) {
		for (fwhole = la->fireWallBaseNum;
		    fwhole < la->fireWallActiveNum &&
		    fw_tstfield(la, la->fireWallField, fwhole);
		    fwhole++);
		if (fwhole == la->fireWallActiveNum) {
			/* No rule point empty - we can't punch more holes. */
			la->fireWallActiveNum = la->fireWallBaseNum;
#ifdef LIBALIAS_DEBUG
			fprintf(stderr, "libalias: Unable to create firewall hole!\n");
#endif
			return;
		}
	}
	/* Start next search at next position */
	la->fireWallActiveNum = fwhole + 1;

	/*
	 * generate two rules of the form
	 *
	 *	add fwhole accept tcp from OAddr OPort to DAddr DPort
	 *	add fwhole accept tcp from DAddr DPort to OAddr OPort
	 */
	if (GetOriginalPort(lnk) != 0 && GetDestPort(lnk) != 0) {
		u_int32_t rulebuf[255];
		int i;

		i = fill_rule(rulebuf, sizeof(rulebuf), fwhole,
		    O_ACCEPT, IPPROTO_TCP,
		    GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk)),
		    GetDestAddress(lnk), ntohs(GetDestPort(lnk)));
		r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i);
		if (r)
			err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)");

		i = fill_rule(rulebuf, sizeof(rulebuf), fwhole,
		    O_ACCEPT, IPPROTO_TCP,
		    GetDestAddress(lnk), ntohs(GetDestPort(lnk)),
		    GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk)));
		r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i);
		if (r)
			err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)");
	}
	/* Indicate hole applied */
	lnk->data.tcp->fwhole = fwhole;
	fw_setfield(la, la->fireWallField, fwhole);
}

/* Remove a hole in a firewall associated with a particular alias
   lnk.  Calling this too often is harmless. */
static void
ClearFWHole(struct alias_link *lnk)
{
	struct libalias *la;

	la = lnk->la;
	if (lnk->link_type == LINK_TCP) {
		int fwhole = lnk->data.tcp->fwhole;	/* Where is the firewall
							 * hole? */
		struct ip_fw rule;

		if (fwhole < 0)
			return;

		memset(&rule, 0, sizeof rule);	/* useless for ipfw2 */
		while (!setsockopt(la->fireWallFD, IPPROTO_IP,
		    IP_FW_DEL, &fwhole, sizeof fwhole));
		fw_clrfield(la, la->fireWallField, fwhole);
		lnk->data.tcp->fwhole = -1;
	}
}

/* Clear out the entire range dedicated to firewall holes. */
static void
ClearAllFWHoles(struct libalias *la)
{
	struct ip_fw rule;	/* On-the-fly built rule */
	int i;

	if (la->fireWallFD < 0)
		return;

	memset(&rule, 0, sizeof rule);
	for (i = la->fireWallBaseNum; i < la->fireWallBaseNum + la->fireWallNumNums; i++) {
		int r = i;

		while (!setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_DEL, &r, sizeof r));
	}
	/* XXX: third arg correct here ? /phk */
	memset(la->fireWallField, 0, la->fireWallNumNums);
}
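/*
 * Illustrative sketch only (not part of the library): enabling the
 * punch-fw machinery from an application.  Reserve a block of ipfw
 * rule numbers with LibAliasSetFWBase() first, then switch the mode
 * flag on so PunchFWHole() has somewhere to put its rules.  The
 * base and count below are made-up values.
 */
#if 0
	LibAliasSetFWBase(la, 10000, 100);	/* reserve rules 10000..10099 */
	LibAliasSetMode(la, PKT_ALIAS_PUNCH_FW, PKT_ALIAS_PUNCH_FW);
#endif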
#endif

void
LibAliasSetFWBase(struct libalias *la, unsigned int base, unsigned int num)
{
#ifndef NO_FW_PUNCH
	la->fireWallBaseNum = base;
	la->fireWallNumNums = num;
#endif
}

void
LibAliasSetSkinnyPort(struct libalias *la, unsigned int port)
{
	la->skinnyPort = port;
}
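/*
 * Illustrative sketch only (not part of the library): because
 * LibAliasSetMode() returns the resulting mode word, passing a zero
 * mask probes the current flags without changing anything.
 */
#if 0
	unsigned int mode;

	mode = LibAliasSetMode(la, 0, 0);	/* pure probe, no change */
	if (mode & PKT_ALIAS_SAME_PORTS)
		/* same-ports aliasing is currently enabled */ ;
#endif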