Index: head/sys/netpfil/ipfw/ip_dn_io.c =================================================================== --- head/sys/netpfil/ipfw/ip_dn_io.c (revision 345161) +++ head/sys/netpfil/ipfw/ip_dn_io.c (revision 345162) @@ -1,970 +1,970 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Dummynet portions related to packet handling. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include #include #include /* ip_len, ip_off */ #include /* ip_output(), IP_FORWARDING */ #include #include #include /* various ether_* routines */ #include /* for ip6_input, ip6_output prototypes */ #include #include #include #include #ifdef NEW_AQM #include #endif #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) * instead of dn_cfg.curr_time */ struct dn_parms dn_cfg; //VNET_DEFINE(struct dn_parms, _base_dn_cfg); static long tick_last; /* Last tick duration (usec). */ static long tick_delta; /* Last vs standard tick diff (usec). */ static long tick_delta_sum; /* Accumulated tick difference (usec).*/ static long tick_adjustment; /* Tick adjustments done. */ static long tick_lost; /* Lost(coalesced) ticks number. */ /* Adjusted vs non-adjusted curr_time difference (ticks). */ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; #ifdef NEW_AQM unsigned long io_pkt_drop; #else static unsigned long io_pkt_drop; #endif /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events * are extracted. */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); #ifdef SYSCTL_NODE /* * Because of the way the SYSBEGIN/SYSEND macros work on other * platforms, there should not be functions between them. * So keep the handlers outside the block. 
*/ static int sysctl_hash_size(SYSCTL_HANDLER_ARGS) { int error, value; value = dn_cfg.hash_size; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value < 16 || value > 65536) return (EINVAL); dn_cfg.hash_size = value; return (0); } static int sysctl_limits(SYSCTL_HANDLER_ARGS) { int error; long value; if (arg2 != 0) value = dn_cfg.slot_limit; else value = dn_cfg.byte_limit; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (arg2 != 0) { if (value < 1) return (EINVAL); dn_cfg.slot_limit = value; } else { if (value < 1500) return (EINVAL); dn_cfg.byte_limit = value; } return (0); } SYSBEGIN(f4) SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); #ifdef NEW_AQM SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); #else static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); #endif /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) #define DC(x) (&(dn_cfg.x)) /* parameters */ SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, "I", "Default hash table size"); SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, "L", "Upper limit in slots for pipe queue."); SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, "L", "Upper limit in bytes for pipe queue."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); /* RED parameters */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, 
DC(red_avg_pkt_size), 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); /* time adjustment */ SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, CTLFLAG_RD, &tick_diff, 0, "Adjusted vs non-adjusted curr_time difference (ticks)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, CTLFLAG_RD, &tick_lost, 0, "Number of ticks coalesced by dummynet taskqueue."); /* Drain parameters */ SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); /* statistics */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, CTLFLAG_RD, &io_pkt, 0, "Number of packets passed to dummynet."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, CTLFLAG_RD, &io_pkt_fast, 0, "Number of packets bypassed dummynet scheduler."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, CTLFLAG_RD, &io_pkt_drop, 0, "Number of packets dropped by dummynet."); #undef DC SYSEND #endif 
static void dummynet_send(struct mbuf *); /* * Return the mbuf tag holding the dummynet state (it should * be the first one on the list). */ struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); #ifdef NEW_AQM /* XXX: to skip ts m_tag. For Debugging only*/ if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) { m_tag_delete(m,mtag); mtag = m_tag_first(m); D("skip TS tag"); } #endif KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } #ifndef NEW_AQM static inline void mq_append(struct mq *q, struct mbuf *m) { #ifdef USERSPACE // buffers from netmap need to be copied // XXX note that the routine is not expected to fail ND("append %p to %p", m, q); if (m->m_flags & M_STACK) { struct mbuf *m_new; void *p; int l, ofs; ofs = m->m_data - m->__m_extbuf; // XXX allocate MGETHDR(m_new, M_NOWAIT, MT_DATA); ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", m, m->__m_extbuf, m->__m_extlen, ofs, m_new); p = m_new->__m_extbuf; /* new pointer */ l = m_new->__m_extlen; /* new len */ if (l <= m->__m_extlen) { panic("extlen too large"); } *m_new = *m; // copy m_new->m_flags &= ~M_STACK; m_new->__m_extbuf = p; // point to new buffer _pkt_copy(m->__m_extbuf, p, m->__m_extlen); m_new->m_data = p + ofs; m = m_new; } #endif /* USERSPACE */ if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->count++; q->tail = m; m->m_nextpkt = NULL; } #endif /* * Dispose a list of packet. Use a functions so if we need to do * more work, this is a central point to do it. 
*/ void dn_free_pkts(struct mbuf *mnext) { struct mbuf *m; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; FREE_PKT(m); } } static int red_drops (struct dn_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. */ struct dn_fsk *fs = q->fs; int64_t p_b = 0; /* Queue in bytes or packets? */ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? q->ni.len_bytes : q->ni.length; /* Average queue size estimation. */ if (q_size != 0) { /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); q->avg += (int)v; } else { /* * Queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } /* Should i drop? 
*/ if (q->avg < fs->min_th) { q->count = -1; return (0); /* accept packet */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->fs.flags & DN_IS_ECN) return (1); if (fs->fs.flags & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than * max_th the packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th * c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4; } else { q->count = -1; return (1); } } else if (q->avg > fs->min_th) { if (fs->fs.flags & DN_IS_ECN) return (1); /* * We compute p_b using the linear dropping function * p_b = c_1 * avg - c_2 * where c_1 = max_p / (max_th - min_th) * c_2 = max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; } if (fs->fs.flags & DN_QSIZE_BYTES) p_b = div64((p_b * len) , fs->max_pkt_size); if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { q->count = 0; /* After a drop we calculate a new random value. */ q->random = random() & 0xffff; return (1); /* drop */ } } /* End of RED algorithm. 
*/ return (0); /* accept */ } /* * ECN/ECT Processing (partially adopted from altq) */ #ifndef NEW_AQM static #endif int ecn_mark(struct mbuf* m) { struct ip *ip; ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off); switch (ip->ip_v) { case IPVERSION: { uint16_t old; if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) return (0); /* not-ECT */ if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) return (1); /* already marked */ /* * ecn-capable but not marked, * mark CE and update checksum */ old = *(uint16_t *)ip; ip->ip_tos |= IPTOS_ECN_CE; ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); return (1); } #ifdef INET6 case (IPV6_VERSION >> 4): { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return (0); /* version mismatch! */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_NOTECT << 20)) return (0); /* not-ECT */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_CE << 20)) return (1); /* already marked */ /* * ecn-capable but not marked, mark CE */ flowlabel |= (IPTOS_ECN_CE << 20); ip6->ip6_flow = htonl(flowlabel); return (1); } #endif } return (0); } /* * Enqueue a packet in q, subject to space and queue management policy * (whose parameters are in q->fs). * Update stats for the queue and the scheduler. * Return 0 on success, 1 on drop. The packet is consumed anyways. */ int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) { struct dn_fs *f; struct dn_flow *ni; /* stats for scheduler instance */ uint64_t len; if (q->fs == NULL || q->_si == NULL) { printf("%s fs %p si %p, dropping\n", __FUNCTION__, q->fs, q->_si); FREE_PKT(m); return 1; } f = &(q->fs->fs); ni = &q->_si->ni; len = m->m_pkthdr.len; /* Update statistics, then check reasons to drop pkt. 
*/ q->ni.tot_bytes += len; q->ni.tot_pkts++; ni->tot_bytes += len; ni->tot_pkts++; if (drop) goto drop; if (f->plr && random() < f->plr) goto drop; #ifdef NEW_AQM /* Call AQM enqueue function */ if (q->fs->aqmfp) return q->fs->aqmfp->enqueue(q ,m); #endif if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) goto drop; } if (f->flags & DN_QSIZE_BYTES) { if (q->ni.len_bytes > f->qsize) goto drop; } else if (q->ni.length >= f->qsize) { goto drop; } mq_append(&q->mq, m); q->ni.length++; q->ni.len_bytes += len; ni->length++; ni->len_bytes += len; return (0); drop: io_pkt_drop++; q->ni.drops++; ni->drops++; FREE_PKT(m); return (1); } /* * Fetch packets from the delay line which are due now. If there are * leftover packets, reinsert the delay line in the heap. * Runs under scheduler lock. */ static void transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) { struct mbuf *m; struct dn_pkt_tag *pkt = NULL; dline->oid.subtype = 0; /* not in heap */ while ((m = dline->mq.head) != NULL) { pkt = dn_tag_get(m); if (!DN_KEY_LEQ(pkt->output_time, now)) break; dline->mq.head = m->m_nextpkt; dline->mq.count--; mq_append(q, m); } if (m != NULL) { dline->oid.subtype = 1; /* in heap */ heap_insert(&dn_cfg.evheap, pkt->output_time, dline); } } /* * Convert the additional MAC overheads/delays into an equivalent * number of bits for the given data rate. The samples are * in milliseconds so we need to divide by 1000. */ static uint64_t extra_bits(struct mbuf *m, struct dn_schk *s) { int index; uint64_t bits; struct dn_profile *pf = s->profile; if (!pf || pf->samples_no == 0) return 0; index = random() % pf->samples_no; bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); if (index >= pf->loss_level) { struct dn_pkt_tag *dt = dn_tag_get(m); if (dt) dt->dn_dir = DIR_DROP; } return bits; } /* * Send traffic from a scheduler instance due by 'now'. * Return a pointer to the head of the queue. 
*/ static struct mbuf * serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) { struct mq def_q; struct dn_schk *s = si->sched; struct mbuf *m = NULL; int delay_line_idle = (si->dline.mq.head == NULL); int done, bw; if (q == NULL) { q = &def_q; q->head = NULL; } bw = s->link.bandwidth; si->kflags &= ~DN_ACTIVE; if (bw > 0) si->credit += (now - si->sched_time) * bw; else si->credit = 0; si->sched_time = now; done = 0; while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { uint64_t len_scaled; done++; len_scaled = (bw == 0) ? 0 : hz * (m->m_pkthdr.len * 8 + extra_bits(m, s)); si->credit -= len_scaled; /* Move packet in the delay line */ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; mq_append(&si->dline.mq, m); } /* * If credit >= 0 the instance is idle, mark time. * Otherwise put back in the heap, and adjust the output * time of the last inserted packet, m, which was too early. */ if (si->credit >= 0) { si->idle_time = now; } else { uint64_t t; KASSERT (bw > 0, ("bw=0 and credit<0 ?")); t = div64(bw - 1 - si->credit, bw); if (m) dn_tag_get(m)->output_time += t; si->kflags |= DN_ACTIVE; heap_insert(&dn_cfg.evheap, now + t, si); } if (delay_line_idle && done) transmit_event(q, &si->dline, now); return q->head; } /* * The timer handler for dummynet. Time is computed in ticks, but * but the code is tolerant to the actual rate at which this is called. * Once complete, the function reschedules itself for the next tick. */ void dummynet_task(void *context, int pending) { struct timeval t; struct mq q = { NULL, NULL }; /* queue to accumulate results */ CURVNET_SET((struct vnet *)context); DN_BH_WLOCK(); /* Update number of lost(coalesced) ticks. */ tick_lost += pending - 1; getmicrouptime(&t); /* Last tick duration (usec). */ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + (t.tv_usec - dn_cfg.prev_t.tv_usec); /* Last tick vs standard tick difference (usec). 
*/ tick_delta = (tick_last * hz - 1000000) / hz; /* Accumulated tick difference (usec). */ tick_delta_sum += tick_delta; dn_cfg.prev_t = t; /* * Adjust curr_time if the accumulated tick difference is * greater than the 'standard' tick. Since curr_time should * be monotonically increasing, we do positive adjustments * as required, and throttle curr_time in case of negative * adjustment. */ dn_cfg.curr_time++; if (tick_delta_sum - tick >= 0) { int diff = tick_delta_sum / tick; dn_cfg.curr_time += diff; tick_diff += diff; tick_delta_sum %= tick; tick_adjustment++; } else if (tick_delta_sum + tick <= 0) { dn_cfg.curr_time--; tick_diff--; tick_delta_sum += tick; tick_adjustment++; } /* serve pending events, accumulate in q */ for (;;) { struct dn_id *p; /* generic parameter to handler */ if (dn_cfg.evheap.elements == 0 || DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) break; p = HEAP_TOP(&dn_cfg.evheap)->object; heap_extract(&dn_cfg.evheap, NULL); if (p->type == DN_SCH_I) { serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); } else { /* extracted a delay line */ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); } } if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { dn_cfg.expire_cycle = 0; dn_drain_scheduler(); dn_drain_queue(); } dn_reschedule(); DN_BH_WUNLOCK(); if (q.head != NULL) dummynet_send(q.head); CURVNET_RESTORE(); } /* * forward a chain of packets to the proper destination. * This runs outside the dummynet lock. */ static void dummynet_send(struct mbuf *m) { struct mbuf *n; for (; m != NULL; m = n) { struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ struct m_tag *tag; int dst; n = m->m_nextpkt; m->m_nextpkt = NULL; tag = m_tag_first(m); if (tag == NULL) { /* should not happen */ dst = DIR_DROP; } else { struct dn_pkt_tag *pkt = dn_tag_get(m); /* extract the dummynet info, rename the tag * to carry reinject info. 
*/ if (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2) && pkt->ifp == NULL) { dst = DIR_DROP; } else { dst = pkt->dn_dir; ifp = pkt->ifp; tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; } } switch (dst) { case DIR_OUT: ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); break ; case DIR_IN : netisr_dispatch(NETISR_IP, m); break; #ifdef INET6 case DIR_IN | PROTO_IPV6: netisr_dispatch(NETISR_IPV6, m); break; case DIR_OUT | PROTO_IPV6: ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); break; #endif case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. * Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup failed, " "dropping packet\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); break; case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */ ether_output_frame(ifp, m); break; case DIR_DROP: /* drop the packet after some time */ FREE_PKT(m); break; default: printf("dummynet: bad switch %d!\n", dst); FREE_PKT(m); break; } } } static inline int tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct dn_pkt_tag *dt; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*dt), M_NOWAIT | M_ZERO); if (mtag == NULL) return 1; /* Cannot allocate packet header. */ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ dt = (struct dn_pkt_tag *)(mtag + 1); dt->rule = fwa->rule; dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ dt->dn_dir = dir; - dt->ifp = fwa->oif; + dt->ifp = fwa->flags & IPFW_ARGS_OUT ? fwa->ifp : NULL; /* dt->output tame is updated as we move through */ dt->output_time = dn_cfg.curr_time; dt->iphdr_off = (dir & PROTO_LAYER2) ? ETHER_HDR_LEN : 0; return 0; } /* * dummynet hook for packets. 
* We use the argument to locate the flowset fs and the sched_set sch * associated to it. The we apply flow_mask and sched_mask to * determine the queue and scheduler instances. * * dir where shall we send the packet after dummynet. * *m0 the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, */ int dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) { struct mbuf *m = *m0; struct dn_fsk *fs = NULL; struct dn_sch_inst *si; struct dn_queue *q = NULL; /* default */ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); DN_BH_WLOCK(); io_pkt++; /* we could actually tag outside the lock, but who cares... */ if (tag_mbuf(m, dir, fwa)) goto dropit; if (dn_cfg.busy) { /* if the upper half is busy doing something expensive, * lets queue the packet and move forward */ mq_append(&dn_cfg.pending, m); m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* XXX locate_flowset could be optimised with a direct ref. */ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); if (fs == NULL) goto dropit; /* This queue/pipe does not exist! */ if (fs->sched == NULL) /* should not happen */ goto dropit; /* find scheduler instance, possibly applying sched_mask */ si = ipdn_si_find(fs->sched, &(fwa->f_id)); if (si == NULL) goto dropit; /* * If the scheduler supports multiple queues, find the right one * (otherwise it will be ignored by enqueue). 
*/ if (fs->sched->fp->flags & DN_MULTIQUEUE) { q = ipdn_q_find(fs, si, &(fwa->f_id)); if (q == NULL) goto dropit; } if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; /* dn_enqueue already increases io_pkt_drop */ io_pkt_drop--; goto dropit; } if (si->kflags & DN_ACTIVE) { m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* compute the initial allowance */ if (si->idle_time < dn_cfg.curr_time) { /* Do this only on the first packet on an idle pipe */ struct dn_link *p = &fs->sched->link; si->sched_time = dn_cfg.curr_time; si->credit = dn_cfg.io_fast ? p->bandwidth : 0; if (p->burst) { uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; if (burst > p->burst) burst = p->burst; si->credit += burst; } } /* pass through scheduler and delay line */ m = serve_sched(NULL, si, dn_cfg.curr_time); /* optimization -- pass it back to ipfw for immediate send */ /* XXX Don't call dummynet_send() if scheduler return the packet * just enqueued. This avoid a lock order reversal. * */ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { /* fast io, rename the tag * to carry reinject info. */ struct m_tag *tag = m_tag_first(m); tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; io_pkt_fast++; if (m->m_nextpkt != NULL) { printf("dummynet: fast io: pkt chain detected!\n"); m->m_nextpkt = NULL; } m = NULL; } else { *m0 = NULL; } done: DN_BH_WUNLOCK(); if (m) dummynet_send(m); return 0; dropit: io_pkt_drop++; DN_BH_WUNLOCK(); if (m) FREE_PKT(m); *m0 = NULL; return (fs && (fs->fs.flags & DN_NOERROR)) ? 
0 : ENOBUFS; } Index: head/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw2.c (revision 345161) +++ head/sys/netpfil/ipfw/ip_fw2.c (revision 345162) @@ -1,3453 +1,3446 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include /* for struct grehdr */ #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). 
* The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The 
default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. 
* * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, 
match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. 
*/ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct nhop4_basic nh4; if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ifp != nh4.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } /* * Generate an SCTP packet containing an ABORT chunk. The verification tag * is given by vtag. The T-bit is set in the ABORT chunk if and only if * reflected is not 0. */ static struct mbuf * ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag, int reflected) { struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct sctphdr *sctp; struct sctp_chunkhdr *chunk; u_int16_t hlen, plen, tlen; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: hlen = sizeof(struct ip); break; #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: /* XXX: log me?!? 
*/ FREE_PKT(m); return (NULL); } plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); tlen = hlen + plen; m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = tlen; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, tlen); switch (id->addr_type) { case 4: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(tlen); ip->ip_id = htons(0); ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_SCTP; ip->ip_sum = 0; ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); sctp = (struct sctphdr *)(ip + 1); break; #ifdef INET6 case 6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(plen); ip6->ip6_nxt = IPPROTO_SCTP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = id->dst_ip6; ip6->ip6_dst = id->src_ip6; sctp = (struct sctphdr *)(ip6 + 1); break; #endif } sctp->src_port = htons(id->dst_port); sctp->dest_port = htons(id->src_port); sctp->v_tag = htonl(vtag); sctp->checksum = htonl(0); chunk = (struct sctp_chunkhdr *)(sctp + 1); chunk->chunk_type = SCTP_ABORT_ASSOCIATION; chunk->chunk_flags = 0; if (reflected != 0) { chunk->chunk_flags |= SCTP_HAD_NO_TCB; } chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr)); sctp->checksum = sctp_calculate_cksum(m, hlen); return (m); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. 
*/ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; int len, dir; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) 
seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct nhop6_basic nh6; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) return (0); /* If ifp is provided, check for 
equality with route table. */ if (ifp != NULL && ifp != nh6.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static int map_icmp_unreach(int code) { /* RFC 7915 p4.2 */ switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: return (ICMP6_DST_UNREACH_NOROUTE); case ICMP_UNREACH_PORT: return (ICMP6_DST_UNREACH_NOPORT); default: /* * Map the rest of codes into admit prohibited. * XXX: unreach proto should be mapped into ICMPv6 * parameter problem, but we use only unreach type. 
*/ return (ICMP6_DST_UNREACH_ADMIN); } } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code == ICMP6_UNREACH_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m0; struct sctphdr *sctp; u_int32_t v_tag; int reflected; sctp = (struct sctphdr *)((char *)ip6 + hlen); reflected = 1; v_tag = ntohl(sctp->v_tag); /* Investigate the first chunk header if available */ if (m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { struct sctp_chunkhdr *chunk; chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (m->m_pkthdr.len > hlen + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { struct sctp_init *init; init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. 
*/ v_tag = 0; break; } } if (v_tag == 0) { m0 = NULL; } else { m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. 
*/ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else if (code == ICMP_REJECT_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m; struct sctphdr *sctp; struct sctp_chunkhdr *chunk; struct sctp_init *init; u_int32_t v_tag; int reflected; sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *)); reflected = 1; v_tag = ntohl(sctp->v_tag); if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { /* Look at the first chunk header if available */ chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (iplen > (ip->ip_hl << 2) + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. 
*/ v_tag = 0; break; } } if (v_tag == 0) { m = NULL; } else { m = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; - struct ifnet *oif; int lookupflags; int match; id = &args->f_id; inp = args->inp; - oif = args->oif; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. 
*/ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else if (id->proto == IPPROTO_UDPLITE) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_ulitecbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 - if (oif == NULL) + if (args->flags & IPFW_ARGS_IN) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), - lookupflags, oif, args->m); + lookupflags, NULL, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), - lookupflags, oif, args->m); + lookupflags, args->ifp, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); - if (oif == NULL) + if (args->flags & IPFW_ARGS_IN) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), - lookupflags, oif, args->m); + lookupflags, NULL, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), - lookupflags, oif, args->m); + lookupflags, args->ifp, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. 
slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; args->flags |= IPFW_ARGS_REF; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. 
* args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** - * args->oif Outgoing interface, NULL if packet is incoming. - * The incoming interface is in the mbuf. (in) + * args->ifp Incoming or outgoing interface. * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. 
* (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; - - /* - * oif | args->oif If NULL, ipfw_chk has been called on the - * inbound path (ether_input, ip_input). - * If non-NULL, ipfw_chk has been called on the outbound path - * (ether_output, ip_output). - */ - struct ifnet *oif = args->oif; - int f_pos = 0; /* index of current rule in the array */ int retval = 0; + struct ifnet *oif, *iif; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. 
*/ uint8_t proto; uint16_t src_port, dst_port; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ int iplen = 0; int pktlen; uint16_t etype; /* Host order stored ether type */ struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ src_port = dst_port = 0; pktlen = m->m_pkthdr.len; DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T; \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->flags & IPFW_ARGS_ETHER) etype = ntohs(args->eh->ether_type); else etype = 0; /* Identify IP packets and fill up variables. 
*/ if (pktlen >= sizeof(struct ip6_hdr) && (etype == 0 || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case 
IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_GRE: /* RFC 1701 */ /* XXX GRE header check? 
*/ PULLUP_TO(hlen, ulp, struct grehdr); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, offsetof( struct carp_header, carp_counter)); if (CARP_ADVERTISEMENT != ((struct carp_header *)ulp)->carp_type) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.addr_type = 6; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6); } else if (pktlen >= sizeof(struct ip) && (etype == 0 || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster * matching. 
*/ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.addr_type = 4; args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } else { proto = 0; dst_ip.s_addr = src_ip.s_addr = 0; args->f_id.addr_type = 1; /* XXX */ } #undef PULLUP_TO pktlen = iplen < pktlen ? iplen: pktlen; /* Properly initialize the rest of f_id */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); args->f_id.fib = M_GETFIB(m); IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->flags & IPFW_ARGS_REF) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) 
* Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } + if (args->flags & IPFW_ARGS_IN) { + iif = args->ifp; + oif = NULL; + } else { + MPASS(args->flags & IPFW_ARGS_OUT); + iif = m->m_pkthdr.rcvif; + oif = args->ifp; + } + /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. 
*/ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: - match = iface_match(m->m_pkthdr.rcvif, - (ipfw_insn_if *)cmd, chain, &tablearg); + match = iface_match(iif, (ipfw_insn_if *)cmd, + chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: - match = iface_match(oif ? 
oif : - m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, - chain, &tablearg); + match = iface_match(args->ifp, + (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->flags & IPFW_ARGS_ETHER) { u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->flags & IPFW_ARGS_ETHER) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->flags & IPFW_ARGS_ETHER); break; case O_DIVERTED: if ((args->flags & IPFW_ARGS_REF) == 0) break; /* * For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ match = ((args->rule.info & IPFW_IS_MASK) == IPFW_IS_DIVERT) && ( ((args->rule.info & IPFW_INFO_IN) ? 1: 2) & cmd->arg1); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_DST_LOOKUP: { void *pkey; uint32_t vidx, key; uint16_t keylen; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* Determine lookup key type */ vidx = ((ipfw_insn_u32 *)cmd)->d[1]; if (vidx != 4 /* uid */ && vidx != 5 /* jail */ && is_ipv6 == 0 && is_ipv4 == 0) break; /* Determine key length */ if (vidx == 0 /* dst-ip */ || vidx == 1 /* src-ip */) keylen = is_ipv6 ? sizeof(struct in6_addr): sizeof(in_addr_t); else { keylen = sizeof(key); pkey = &key; } if (vidx == 0 /* dst-ip */) pkey = is_ipv4 ? (void *)&dst_ip: (void *)&args->f_id.dst_ip6; else if (vidx == 1 /* src-ip */) pkey = is_ipv4 ? 
(void *)&src_ip: (void *)&args->f_id.src_ip6; else if (vidx == 6 /* dscp */) { if (is_ipv4) key = ip->ip_tos >> 2; else { key = args->f_id.flow_id6; key = (key & 0x0f) << 2 | (key & 0xf000) >> 14; } key &= 0x3f; } else if (vidx == 2 /* dst-port */ || vidx == 3 /* src-port */) { /* Skip fragments */ if (offset != 0) break; /* Skip proto without ports */ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && proto != IPPROTO_SCTP) break; if (vidx == 2 /* dst-port */) key = dst_port; else key = src_port; } #ifndef USERSPACE else if (vidx == 4 /* uid */ || vidx == 5 /* jail */) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache->cr_uid; else if (vidx == 5 /* jail */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache.uid; else if (vidx == 5 /* jail */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } #endif /* !USERSPACE */ else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; tablearg = vidx; break; } /* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */ /* FALLTHROUGH */ } case O_IP_SRC_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen; if (is_ipv4) { keylen = sizeof(in_addr_t); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &dst_ip; else pkey = &src_ip; } else if (is_ipv6) { keylen = sizeof(struct in6_addr); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &args->f_id.dst_ip6; else pkey = &args->f_id.src_ip6; } else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, vidx, tag); if (!match) break; } tablearg = vidx; break; } case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table(chain, cmd->arg1, 0, &args->f_id, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == 
TARG_VAL(chain, v, tag); if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { match = in_localip(src_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { match = in_localip(dst_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_TCP || proto == IPPROTO_SCTP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? 
src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; #ifdef INET6 if (is_ipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip; if (ip6->ip6_plen == 0) { /* * Jumbo payload is not * supported by this * opcode. 
*/ break; } x = iplen - hlen; } else #endif /* INET6 */ x = iplen - (ip->ip_hl << 2); tcp = TCP(ulp); x -= tcp->th_off << 2; if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. 
*/ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, m, - oif, offset | ip6f_mf, tablearg, ip); + offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ - match = ((oif != NULL) || - (m->m_pkthdr.rcvif == NULL) || + match = (args->flags & IPFW_ARGS_OUT || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), - m->m_pkthdr.rcvif, args->f_id.fib) : + iif, args->f_id.fib) : #endif - verify_path(src_ip, m->m_pkthdr.rcvif, - args->f_id.fib))); + verify_path(src_ip, iif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib)))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? 
verify_path6( - &(args->f_id.src_ip6), - m->m_pkthdr.rcvif, + &(args->f_id.src_ip6), iif, args->f_id.fib) : #endif - verify_path(src_ip, - m->m_pkthdr.rcvif, + verify_path(src_ip, iif, args->f_id.fib); else match = 1; break; case O_IPSEC: match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. 
*/ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else if (proto == IPPROTO_UDPLITE) pi = &V_ulitecbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incoming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). 
* * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule (one * exception is O_SKIP_ACTION which could be * between these opcodes and 'action' one). * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. * * O_SKIP_ACTION: this opcode is not a real 'action' * either, and is stored right before the 'action' * part of the rule, right after the O_KEEP_STATE * opcode. It causes match failure so the real * 'action' could be executed only if the rule * is checked via dynamic rule from the state * table, as in such case execution starts * from the true 'action' opcode directly. 
* */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, (ipfw_insn_limit *)cmd, args, ulp, pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && (q = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule * by setting f, cmd, l and clearing * cmdlen. */ f = q; f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_SKIP_ACTION: match = 0; /* skip to the next rule */ l = 0; /* exit inner loop */ break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->flags & IPFW_ARGS_ETHER) break; /* not on layer 2 */ /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? 
IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. 
*/ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST( &args->f_id.dst_ip6)) { send_reject6(args, cmd->opcode == O_REJECT ? 
map_icmp_unreach(cmd->arg1): cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. */ if (is_ipv6) { struct ip_fw_nh6 *nh6; args->flags |= IPFW_ARGS_NH6; nh6 = &args->hopstore6; nh6->sin6_addr = TARG_VAL( chain, tablearg, nh6); nh6->sin6_port = sa->sin_port; nh6->sin6_scope_id = TARG_VAL( chain, tablearg, zoneid); } else #endif { args->flags |= IPFW_ARGS_NH4; args->hopstore.sin_port = sa->sin_port; sa = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->flags |= IPFW_ARGS_NH4PTR; args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->flags |= IPFW_ARGS_NH6PTR; args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? 
IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; /* XXX */ l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ /* * Ensure that we do not invoke NAT handler for * non IPv4 packets. Libalias expects only IPv4. */ if (!is_ipv4 || !IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; args->rule.info = 0; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; l = 0; /* in any case exit inner loop */ if (is_ipv6) /* IPv6 is not supported yet */ break; IPFW_INC_RULE_COUNTER(f, pktlen); ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. 
*/ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; args->rule.info = 0; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } case O_EXTERNAL_ACTION: l = 0; /* in any case exit inner loop */ retval = ipfw_run_eaction(chain, args, cmd, &done); /* * If both @retval and @done are zero, * consider this as rule matching and * update counters. */ if (retval == 0 && done == 0) { IPFW_INC_RULE_COUNTER(f, pktlen); /* * Reset the result of the last * dynamic state lookup. * External action can change * @args content, and it may be * used for new state lookup later. */ DYN_INFO_INIT(&dyn_info); } break; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. 
*/ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. 
*/
static void
ipfw_destroy(void)
{

	/* Tear down what ipfw_init() set up, then announce the unload. */
	ipfw_iface_destroy();
	ipfw_destroy_sopt_handler();
	ipfw_destroy_obj_rewriter();
	printf("IP firewall unloaded\n");
}

/*
 * Stuff that must be initialized for every instance
 * (including the first of course).
 *
 * Returns ENOSPC when table setup fails, otherwise the result of
 * ipfw_attach_hooks(1).
 */
static int
vnet_ipfw_init(const void *unused)
{
	int error, first;
	struct ip_fw *rule = NULL;
	struct ip_fw_chain *chain;

	chain = &V_layer3_chain;

	/* Non-zero only for the default vnet; passed to the init helpers. */
	first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;

	/* First set up some values that are compile time options */
	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
	V_fw_deny_unknown_exthdrs = 1;
#ifdef IPFIREWALL_VERBOSE
	V_fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
#ifdef IPFIREWALL_NAT
	LIST_INIT(&chain->nat);
#endif

	/* Init shared services hash table */
	ipfw_init_srv(chain);

	ipfw_init_counters();
	/* Set initial number of tables */
	V_fw_tables_max = default_fw_tables;
	error = ipfw_init_tables(chain, first);
	if (error) {
		/*
		 * The specific error code is dropped; ENOSPC is returned.
		 * "rule" is still NULL here, so the second free() has
		 * nothing to release.
		 * NOTE(review): the ipfw_init_srv()/ipfw_init_counters()
		 * setup above is not undone on this path - confirm whether
		 * that is intentional.
		 */
		printf("ipfw2: setting up tables failed\n");
		free(chain->map, M_IPFW);
		free(rule, M_IPFW);
		return (ENOSPC);
	}

	IPFW_LOCK_INIT(chain);

	/* fill and insert the default rule */
	rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
	rule->cmd_len = 1;
	rule->cmd[0].len = 1;
	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
	chain->default_rule = rule;
	ipfw_add_protected_rule(chain, rule, 0);

	ipfw_dyn_init(chain);
	ipfw_eaction_init(chain, first);
#ifdef LINEAR_SKIPTO
	ipfw_init_skipto_cache(chain);
#endif
	ipfw_bpf_init(first);

	/* Per-vnet state is in place; mark the instance as ready. */
	V_ipfw_vnet_ready = 1;		/* Open for business */

	/*
	 * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
	 * Even if the latter two fail we still keep the module alive
	 * because the sockopt and layer2 paths are still useful.
	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
	 * so we can ignore the exact return value and just set a flag.
	 *
	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
	 * changes in the underlying (per-vnet) variables trigger
	 * immediate hook()/unhook() calls.
	 * In layer2 we have the same behaviour, except that V_ether_ipfw
	 * is checked on each packet because there are no pfil hooks.
	 */
	V_ip_fw_ctl_ptr = ipfw_ctl3;
	error = ipfw_attach_hooks(1);
	return (error);
}

/*
 * Called for the removal of each instance.
 * Always returns 0.
 */
static int
vnet_ipfw_uninit(const void *unused)
{
	struct ip_fw *reap;
	struct ip_fw_chain *chain = &V_layer3_chain;
	int i, last;

	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
	/*
	 * disconnect from ipv4, ipv6, layer2 and sockopt.
	 * Then grab, release and grab again the WLOCK so we make
	 * sure the update is propagated and nobody will be in.
	 */
	(void)ipfw_attach_hooks(0 /* detach */);
	V_ip_fw_ctl_ptr = NULL;

	/* Non-zero only for the default vnet; passed to the uninit helpers. */
	last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;

	IPFW_UH_WLOCK(chain);
	IPFW_UH_WUNLOCK(chain);

	ipfw_dyn_uninit(0);	/* run the callout_drain */

	IPFW_UH_WLOCK(chain);

	/* Collect every rule on the reap list, then drop the rule map. */
	reap = NULL;
	IPFW_WLOCK(chain);
	for (i = 0; i < chain->n_rules; i++)
		ipfw_reap_add(chain, &reap, chain->map[i]);
	free(chain->map, M_IPFW);
#ifdef LINEAR_SKIPTO
	ipfw_destroy_skipto_cache(chain);
#endif
	IPFW_WUNLOCK(chain);
	IPFW_UH_WUNLOCK(chain);
	ipfw_destroy_tables(chain, last);
	ipfw_eaction_uninit(chain, last);
	/* The collected rules are reaped only after both locks are dropped. */
	if (reap != NULL)
		ipfw_reap_rules(reap);
	vnet_ipfw_iface_destroy(chain);
	ipfw_destroy_srv(chain);
	IPFW_LOCK_DESTROY(chain);
	ipfw_dyn_uninit(1);	/* free the remaining parts */
	ipfw_destroy_counters();
	ipfw_bpf_uninit(last);
	return (0);
}

/*
 * Module event handler.
 * In general we have the choice of handling most of these events by the
 * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
 * use the SYSINIT handlers as they are more capable of expressing the
 * flow of control during module and vnet operations, so this is just
 * a skeleton. Note there is no SYSINIT equivalent of the module
 * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
*/ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. 
*/ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: head/sys/netpfil/ipfw/ip_fw_dynamic.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_dynamic.c (revision 345161) +++ head/sys/netpfil/ipfw/ip_fw_dynamic.c (revision 345162) @@ -1,3268 +1,3265 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017-2018 Yandex LLC * Copyright (c) 2017-2018 Andrey V. Elsukov * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IN6_ARE_ADDR_EQUAL */ #ifdef INET6 #include #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * Description of dynamic states. * * Dynamic states are stored in lists accessed through a hash tables * whose size is curr_dyn_buckets. This value can be modified through * the sysctl variable dyn_buckets. * * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent, * and dyn_ipv6_parent. * * When a packet is received, its address fields hashed, then matched * against the entries in the corresponding list by addr_type. * Dynamic states can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic states is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic states is equal to UMA zone items count. * The max number of dynamic states is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each state holds a pointer to the parent ipfw rule so we know what * action to perform. Dynamic rules are removed when the parent rule is * deleted. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! 
*/ /* By default use jenkins hash function */ #define IPFIREWALL_JENKINSHASH #define DYN_COUNTER_INC(d, dir, pktlen) do { \ (d)->pcnt_ ## dir++; \ (d)->bcnt_ ## dir += pktlen; \ } while (0) #define DYN_REFERENCED 0x01 /* * DYN_REFERENCED flag is used to show that state keeps reference to named * object, and this reference should be released when state becomes expired. */ struct dyn_data { void *parent; /* pointer to parent rule */ uint32_t chain_id; /* cached ruleset id */ uint32_t f_pos; /* cached rule index */ uint32_t hashval; /* hash value used for hash resize */ uint16_t fibnum; /* fib used to send keepalives */ uint8_t _pad[3]; uint8_t flags; /* internal flags */ uint16_t rulenum; /* parent rule number */ uint32_t ruleid; /* parent rule id */ uint32_t state; /* TCP session state and flags */ uint32_t ack_fwd; /* most recent ACKs in forward */ uint32_t ack_rev; /* and reverse direction (used */ /* to generate keepalives) */ uint32_t sync; /* synchronization time */ uint32_t expire; /* expire time */ uint64_t pcnt_fwd; /* bytes counter in forward */ uint64_t bcnt_fwd; /* packets counter in forward */ uint64_t pcnt_rev; /* bytes counter in reverse */ uint64_t bcnt_rev; /* packets counter in reverse */ }; #define DPARENT_COUNT_DEC(p) do { \ MPASS(p->count > 0); \ ck_pr_dec_32(&(p)->count); \ } while (0) #define DPARENT_COUNT_INC(p) ck_pr_inc_32(&(p)->count) #define DPARENT_COUNT(p) ck_pr_load_32(&(p)->count) struct dyn_parent { void *parent; /* pointer to parent rule */ uint32_t count; /* number of linked states */ uint8_t _pad[2]; uint16_t rulenum; /* parent rule number */ uint32_t ruleid; /* parent rule id */ uint32_t hashval; /* hash value used for hash resize */ uint32_t expire; /* expire time */ }; struct dyn_ipv4_state { uint8_t type; /* State type */ uint8_t proto; /* UL Protocol */ uint16_t kidx; /* named object index */ uint16_t sport, dport; /* ULP source and destination ports */ in_addr_t src, dst; /* IPv4 source and destination */ union { struct 
dyn_data *data; struct dyn_parent *limit; }; CK_SLIST_ENTRY(dyn_ipv4_state) entry; SLIST_ENTRY(dyn_ipv4_state) expired; }; CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state); VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4); VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4_parent); SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state); VNET_DEFINE_STATIC(struct dyn_ipv4_slist, dyn_expired_ipv4); #define V_dyn_ipv4 VNET(dyn_ipv4) #define V_dyn_ipv4_parent VNET(dyn_ipv4_parent) #define V_dyn_expired_ipv4 VNET(dyn_expired_ipv4) #ifdef INET6 struct dyn_ipv6_state { uint8_t type; /* State type */ uint8_t proto; /* UL Protocol */ uint16_t kidx; /* named object index */ uint16_t sport, dport; /* ULP source and destination ports */ struct in6_addr src, dst; /* IPv6 source and destination */ uint32_t zoneid; /* IPv6 scope zone id */ union { struct dyn_data *data; struct dyn_parent *limit; }; CK_SLIST_ENTRY(dyn_ipv6_state) entry; SLIST_ENTRY(dyn_ipv6_state) expired; }; CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state); VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6); VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6_parent); SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state); VNET_DEFINE_STATIC(struct dyn_ipv6_slist, dyn_expired_ipv6); #define V_dyn_ipv6 VNET(dyn_ipv6) #define V_dyn_ipv6_parent VNET(dyn_ipv6_parent) #define V_dyn_expired_ipv6 VNET(dyn_expired_ipv6) #endif /* INET6 */ /* * Per-CPU pointer indicates that specified state is currently in use * and must not be reclaimed by expiration callout. 
*/ static void **dyn_hp_cache; DPCPU_DEFINE_STATIC(void *, dyn_hp); #define DYNSTATE_GET(cpu) ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp)) #define DYNSTATE_PROTECT(v) ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v)) #define DYNSTATE_RELEASE() DYNSTATE_PROTECT(NULL) #define DYNSTATE_CRITICAL_ENTER() critical_enter() #define DYNSTATE_CRITICAL_EXIT() do { \ DYNSTATE_RELEASE(); \ critical_exit(); \ } while (0); /* * We keep two version numbers, one is updated when new entry added to * the list. Second is updated when an entry deleted from the list. * Versions are updated under bucket lock. * * Bucket "add" version number is used to know, that in the time between * state lookup (i.e. ipfw_dyn_lookup_state()) and the followed state * creation (i.e. ipfw_dyn_install_state()) another concurrent thread did * not install some state in this bucket. Using this info we can avoid * additional state lookup, because we are sure that we will not install * the state twice. * * Also doing the tracking of bucket "del" version during lookup we can * be sure, that state entry was not unlinked and freed in time between * we read the state pointer and protect it with hazard pointer. * * An entry unlinked from CK list keeps unchanged until it is freed. * Unlinked entries are linked into expired lists using "expired" field. */ /* * dyn_expire_lock is used to protect access to dyn_expired_xxx lists. * dyn_bucket_lock is used to get write access to lists in specific bucket. * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6, * and ipv6_parent lists. */ VNET_DEFINE_STATIC(struct mtx, dyn_expire_lock); VNET_DEFINE_STATIC(struct mtx *, dyn_bucket_lock); #define V_dyn_expire_lock VNET(dyn_expire_lock) #define V_dyn_bucket_lock VNET(dyn_bucket_lock) /* * Bucket's add/delete generation versions. 
*/ VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_del); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_del); #define V_dyn_ipv4_add VNET(dyn_ipv4_add) #define V_dyn_ipv4_del VNET(dyn_ipv4_del) #define V_dyn_ipv4_parent_add VNET(dyn_ipv4_parent_add) #define V_dyn_ipv4_parent_del VNET(dyn_ipv4_parent_del) #ifdef INET6 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_del); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_del); #define V_dyn_ipv6_add VNET(dyn_ipv6_add) #define V_dyn_ipv6_del VNET(dyn_ipv6_del) #define V_dyn_ipv6_parent_add VNET(dyn_ipv6_parent_add) #define V_dyn_ipv6_parent_del VNET(dyn_ipv6_parent_del) #endif /* INET6 */ #define DYN_BUCKET(h, b) ((h) & (b - 1)) #define DYN_BUCKET_VERSION(b, v) ck_pr_load_32(&V_dyn_ ## v[(b)]) #define DYN_BUCKET_VERSION_BUMP(b, v) ck_pr_inc_32(&V_dyn_ ## v[(b)]) #define DYN_BUCKET_LOCK_INIT(lock, b) \ mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF) #define DYN_BUCKET_LOCK_DESTROY(lock, b) mtx_destroy(&lock[(b)]) #define DYN_BUCKET_LOCK(b) mtx_lock(&V_dyn_bucket_lock[(b)]) #define DYN_BUCKET_UNLOCK(b) mtx_unlock(&V_dyn_bucket_lock[(b)]) #define DYN_BUCKET_ASSERT(b) mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED) #define DYN_EXPIRED_LOCK_INIT() \ mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF) #define DYN_EXPIRED_LOCK_DESTROY() mtx_destroy(&V_dyn_expire_lock) #define DYN_EXPIRED_LOCK() mtx_lock(&V_dyn_expire_lock) #define DYN_EXPIRED_UNLOCK() mtx_unlock(&V_dyn_expire_lock) VNET_DEFINE_STATIC(uint32_t, dyn_buckets_max); VNET_DEFINE_STATIC(uint32_t, curr_dyn_buckets); VNET_DEFINE_STATIC(struct callout, dyn_timeout); #define V_dyn_buckets_max VNET(dyn_buckets_max) #define V_curr_dyn_buckets VNET(curr_dyn_buckets) #define V_dyn_timeout VNET(dyn_timeout) /* Maximum length of states chain in a bucket */ 
VNET_DEFINE_STATIC(uint32_t, curr_max_length);
#define	V_curr_max_length		VNET(curr_max_length)

/* Exported as the dyn_keep_states sysctl. */
VNET_DEFINE_STATIC(uint32_t, dyn_keep_states);
#define	V_dyn_keep_states		VNET(dyn_keep_states)

/* UMA zones; the data and parent zones are capped by dyn_max/dyn_parent_max. */
VNET_DEFINE_STATIC(uma_zone_t, dyn_data_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_parent_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv4_zone);
#ifdef INET6
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv6_zone);
#define	V_dyn_ipv6_zone			VNET(dyn_ipv6_zone)
#endif /* INET6 */
#define	V_dyn_data_zone			VNET(dyn_data_zone)
#define	V_dyn_parent_zone		VNET(dyn_parent_zone)
#define	V_dyn_ipv4_zone			VNET(dyn_ipv4_zone)

/*
 * Timeouts for various events in handling dynamic rules.
 */
VNET_DEFINE_STATIC(uint32_t, dyn_ack_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_syn_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_fin_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_rst_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_udp_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_short_lifetime);

#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_interval);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_period);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive);
VNET_DEFINE_STATIC(time_t, dyn_keepalive_last);

#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
#define	V_dyn_keepalive			VNET(dyn_keepalive)
#define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)

VNET_DEFINE_STATIC(uint32_t, dyn_max);		/* max # of dynamic states */
VNET_DEFINE_STATIC(uint32_t, dyn_count);	/* number of states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_max);	/* max # of parent states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_count);	/* number of parent states */

#define	V_dyn_max			VNET(dyn_max)
#define	V_dyn_count			VNET(dyn_count)
#define	V_dyn_parent_max		VNET(dyn_parent_max)
#define	V_dyn_parent_count		VNET(dyn_parent_count)

/*
 * Atomic helpers for the per-vnet counters above; "name" is the variable
 * name without its V_ prefix (the macros paste the prefix on).
 * The decrement asserts that the counter is still positive.
 */
#define	DYN_COUNT_DEC(name)	do {			\
	MPASS((V_ ## name) > 0);			\
	ck_pr_dec_32(&(V_ ## name));			\
} while (0)
#define	DYN_COUNT_INC(name)	ck_pr_inc_32(&(V_ ## name))
#define	DYN_COUNT(name)		ck_pr_load_32(&(V_ ## name))

static time_t last_log;	/* Log ratelimiting */

/*
 * Get/set maximum number of dynamic states in given VNET instance.
*/ static int sysctl_dyn_max(SYSCTL_HANDLER_ARGS) { uint32_t nstates; int error; nstates = V_dyn_max; error = sysctl_handle_32(oidp, &nstates, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); V_dyn_max = nstates; uma_zone_set_max(V_dyn_data_zone, V_dyn_max); return (0); } static int sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS) { uint32_t nstates; int error; nstates = V_dyn_parent_max; error = sysctl_handle_32(oidp, &nstates, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); V_dyn_parent_max = nstates; uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max); return (0); } static int sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS) { uint32_t nbuckets; int error; nbuckets = V_dyn_buckets_max; error = sysctl_handle_32(oidp, &nbuckets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); if (nbuckets > 256) V_dyn_buckets_max = 1 << fls(nbuckets - 1); else return (EINVAL); return (0); } SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0, "Current number of dynamic states."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0, "Current number of parent states. 
"); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, "Current number of buckets for states hash table."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0, "Current maximum length of states chains in hash buckets."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_buckets, "IU", "Max number of buckets for dynamic states hash table."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_max, "IU", "Max number of dynamic states."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_parent_max, "IU", "Max number of parent dynamic states."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, "Lifetime of dynamic states for TCP ACK."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, "Lifetime of dynamic states for TCP SYN."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, "Lifetime of dynamic states for TCP FIN."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, "Lifetime of dynamic states for TCP RST."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, "Lifetime of dynamic states for UDP."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, "Lifetime of dynamic states for other situations."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable keepalives for dynamic states."); 
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
    "Do not flush dynamic states on rule deletion");

/*
 * DYN_DEBUG() prints "<function name>: <message>" when the file is built
 * with IPFIREWALL_DYNDEBUG and expands to nothing otherwise.
 */
#ifdef IPFIREWALL_DYNDEBUG
#define	DYN_DEBUG(fmt, ...)	do {			\
	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
} while (0)
#else
#define	DYN_DEBUG(fmt, ...)
#endif /* !IPFIREWALL_DYNDEBUG */

#ifdef INET6
/* Functions to work with IPv6 states */
static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, const void *,
    struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
    uint32_t, const void *, int, uint32_t, uint16_t);
static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t);
static int dyn_add_ipv6_state(void *, uint32_t, uint16_t,
    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
    ipfw_dyn_rule *);

static uint32_t dyn_getscopeid(const struct ip_fw_args *);
static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
    uint16_t);
static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
    const struct dyn_ipv6_state *);
static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);

static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, uint16_t);
#endif /* INET6 */

/* Functions to work with limit states */
static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
    struct ip_fw *, uint32_t, uint32_t, uint16_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t,
    uint32_t);
static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t);

static void dyn_tick(void *);
static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
static void dyn_free_states(struct ip_fw_chain *);
static void dyn_export_parent(const struct dyn_parent *, uint16_t, uint8_t,
    ipfw_dyn_rule *);
static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t,
    uint8_t, ipfw_dyn_rule *);
static uint32_t dyn_update_tcp_state(struct dyn_data *,
    const struct ipfw_flow_id *, const struct tcphdr *, int);
static void dyn_update_proto_state(struct dyn_data *,
    const struct ipfw_flow_id *, const void *, int, int);

/* Functions to work with IPv4 states */
/*
 * NOTE(review): this is the only prototype in the list without "static";
 * confirm against its definition whether external linkage is intended.
 */
struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
    const void *, struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
    const void *, int, uint32_t, uint16_t);
static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
    const struct ipfw_flow_id *, uint16_t, uint8_t);
static int dyn_add_ipv4_state(void *, uint32_t, uint16_t,
    const struct ipfw_flow_id *, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
    ipfw_dyn_rule *);

/*
 * Named states support.
 */
/* Name used when the caller passes no name index (uidx == 0). */
static char *default_state_name = "default";
/* A named state object: the generic header plus storage for its name. */
struct dyn_state_obj {
	struct named_object	no;
	char			name[64];
};

/* Fetch the state object stored at service index (cmd)->arg1 of ch. */
#define	DYN_STATE_OBJ(ch, cmd)	\
    ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1))
/*
 * Classifier callback.
* Return 0 if opcode contains object that should be referenced * or rewritten. */ static int dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); /* Don't rewrite "check-state any" */ if (cmd->arg1 == 0 && cmd->opcode == O_CHECK_STATE) return (1); *puidx = cmd->arg1; *ptype = 0; return (0); } static void dyn_update(ipfw_insn *cmd, uint16_t idx) { cmd->arg1 = idx; DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); } static int dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { ipfw_obj_ntlv *ntlv; const char *name; DYN_DEBUG("uidx %d", ti->uidx); if (ti->uidx != 0) { if (ti->tlvs == NULL) return (EINVAL); /* Search ntlv in the buffer provided by user */ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_STATE_NAME); if (ntlv == NULL) return (EINVAL); name = ntlv->name; } else name = default_state_name; /* * Search named object with corresponding name. * Since states objects are global - ignore the set value * and use zero instead. */ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_STATE_NAME, name); /* * We always return success here. * The caller will check *pno and mark object as unresolved, * then it will automatically create "default" object. 
*/ return (0); } static struct named_object * dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx) { DYN_DEBUG("kidx %d", idx); return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx)); } static int dyn_create(struct ip_fw_chain *ch, struct tid_info *ti, uint16_t *pkidx) { struct namedobj_instance *ni; struct dyn_state_obj *obj; struct named_object *no; ipfw_obj_ntlv *ntlv; char *name; DYN_DEBUG("uidx %d", ti->uidx); if (ti->uidx != 0) { if (ti->tlvs == NULL) return (EINVAL); ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_STATE_NAME); if (ntlv == NULL) return (EINVAL); name = ntlv->name; } else name = default_state_name; ni = CHAIN_TO_SRV(ch); obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO); obj->no.name = obj->name; obj->no.etlv = IPFW_TLV_STATE_NAME; strlcpy(obj->name, name, sizeof(obj->name)); IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_STATE_NAME, name); if (no != NULL) { /* * Object is already created. * Just return its kidx and bump refcount. 
*/ *pkidx = no->kidx; no->refcnt++; IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); DYN_DEBUG("\tfound kidx %d", *pkidx); return (0); } if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) { DYN_DEBUG("\talloc_idx failed for %s", name); IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); return (ENOSPC); } ipfw_objhash_add(ni, &obj->no); SRV_OBJECT(ch, obj->no.kidx) = obj; obj->no.refcnt++; *pkidx = obj->no.kidx; IPFW_UH_WUNLOCK(ch); DYN_DEBUG("\tcreated kidx %d", *pkidx); return (0); } static void dyn_destroy(struct ip_fw_chain *ch, struct named_object *no) { struct dyn_state_obj *obj; IPFW_UH_WLOCK_ASSERT(ch); KASSERT(no->etlv == IPFW_TLV_STATE_NAME, ("%s: wrong object type %u", __func__, no->etlv)); KASSERT(no->refcnt == 1, ("Destroying object '%s' (type %u, idx %u) with refcnt %u", no->name, no->etlv, no->kidx, no->refcnt)); DYN_DEBUG("kidx %d", no->kidx); obj = SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; ipfw_objhash_del(CHAIN_TO_SRV(ch), no); ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx); free(obj, M_IPFW); } static struct opcode_obj_rewrite dyn_opcodes[] = { { O_KEEP_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_CHECK_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_PROBE_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_LIMIT, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, }; /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. 
*/
#ifndef IPFIREWALL_JENKINSHASH
/*
 * XOR-based flow hash.
 * XOR is commutative, so swapping source and destination (address and
 * port) yields the same value. For IPv6 only 32-bit words 2 and 3 of each
 * address take part in the hash.
 */
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	uint32_t i;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
	else
#endif /* INET6 */
	/* Without INET6 (or for non-IPv6 flows) hash the IPv4 addresses. */
	i = (id->dst_ip) ^ (id->src_ip);
	i ^= (id->dst_port) ^ (id->src_port);
	return (i);
}

/*
 * Hash for parent (O_LIMIT) states: the flow hash mixed with the
 * address of the parent rule.
 */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (hash_packet(id) ^ ((uintptr_t)rule));
}

#else /* IPFIREWALL_JENKINSHASH */

/* Seed passed to jenkins_hash32() for flow hashing. */
VNET_DEFINE_STATIC(uint32_t, dyn_hashseed);
#define	V_dyn_hashseed		VNET(dyn_hashseed)

/*
 * Decide the canonical ordering of an IPv4 tuple.
 * Returns 0 when (src, sport) should be placed first, 1 when (dst, dport)
 * should be placed first. Addresses are compared first, ports break ties.
 */
static __inline int
addrcmp4(const struct ipfw_flow_id *id)
{

	if (id->src_ip < id->dst_ip)
		return (0);
	if (id->src_ip > id->dst_ip)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

#ifdef INET6
/*
 * IPv6 counterpart of addrcmp4(): addresses are compared with memcmp(),
 * ports break ties.
 */
static __inline int
addrcmp6(const struct ipfw_flow_id *id)
{
	int ret;

	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
	if (ret < 0)
		return (0);
	if (ret > 0)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

/*
 * Jenkins hash of an IPv6 flow.
 * The tuple is stored in the order chosen by addrcmp6() before hashing,
 * so both directions of a flow produce the same value.
 */
static __inline uint32_t
hash_packet6(const struct ipfw_flow_id *id)
{
	struct tuple6 {
		struct in6_addr	addr[2];
		uint16_t	port[2];
	} t6;

	if (addrcmp6(id) == 0) {
		t6.addr[0] = id->src_ip6;
		t6.addr[1] = id->dst_ip6;
		t6.port[0] = id->src_port;
		t6.port[1] = id->dst_port;
	} else {
		t6.addr[0] = id->dst_ip6;
		t6.addr[1] = id->src_ip6;
		t6.port[0] = id->dst_port;
		t6.port[1] = id->src_port;
	}
	return (jenkins_hash32((const uint32_t *)&t6,
	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
}
#endif

/*
 * Jenkins hash of a flow id.
 * IPv4 tuples are ordered via addrcmp4() and hashed here, IPv6 flows are
 * delegated to hash_packet6(). Any other address family hashes to 0.
 */
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	struct tuple4 {
		in_addr_t	addr[2];
		uint16_t	port[2];
	} t4;

	if (IS_IP4_FLOW_ID(id)) {
		/* All fields are in host byte order */
		if (addrcmp4(id) == 0) {
			t4.addr[0] = id->src_ip;
			t4.addr[1] = id->dst_ip;
			t4.port[0] = id->src_port;
			t4.port[1] = id->dst_port;
		} else {
			t4.addr[0] = id->dst_ip;
			t4.addr[1] = id->src_ip;
			t4.port[0] = id->dst_port;
			t4.port[1] = id->src_port;
		}
		return (jenkins_hash32((const uint32_t *)&t4,
		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
	} else
#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		return (hash_packet6(id));
#endif
	return (0);
}

/*
 * Hash for parent (O_LIMIT) states: the bytes of the rule pointer are
 * hashed, using the flow hash as the initial value.
 */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (jenkins_hash32((const uint32_t *)&rule,
	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
}
#endif /* IPFIREWALL_JENKINSHASH */

/*
 * Print customizable flow id description via log(9) facility.
 *
 * id        - flow to describe;
 * dyn_type  - state type, printed as a number;
 * log_flags - priority passed to log();
 * prefix, postfix - free-form text printed before and after the flow.
 * The current value of V_dyn_count is included in the message.
 */
static void
print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
    int log_flags, char *prefix, char *postfix)
{
	struct in_addr da;
#ifdef INET6
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif

#ifdef INET6
	if (IS_IP6_FLOW_ID(id)) {
		ip6_sprintf(src, &id->src_ip6);
		ip6_sprintf(dst, &id->dst_ip6);
	} else
#endif
	{
		/* Flow id keeps IPv4 addresses in host byte order. */
		da.s_addr = htonl(id->src_ip);
		inet_ntop(AF_INET, &da, src, sizeof(src));
		da.s_addr = htonl(id->dst_ip);
		inet_ntop(AF_INET, &da, dst, sizeof(dst));
	}
	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
	    prefix, dyn_type, src, id->src_port, dst,
	    id->dst_port, V_dyn_count, postfix);
}

/* Same as print_dyn_rule_flags() with LOG_DEBUG priority. */
#define	print_dyn_rule(id, dtype, prefix, postfix)	\
	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)

/* Wrap-safe comparisons: the difference is interpreted as signed int. */
#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
/*
 * TCP flags of the forward direction live in the low byte of the state
 * word, flags of the reverse direction are shifted left by 8 bits.
 */
#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
#define	TCP_FLAGS	(TH_FLAGS | (TH_FLAGS << 8))
#define	ACK_FWD		0x00010000	/* fwd ack seen */
#define	ACK_REV		0x00020000	/* rev ack seen */
#define	ACK_BOTH	(ACK_FWD | ACK_REV)

/*
 * Track TCP connection state for a dynamic state and compute the new
 * expiration time.
 *
 * data - state data to update (flags and ACK numbers are stored with
 *        ck_pr_* primitives);
 * pkt  - flow id, pkt->_flags carries the TCP flags of the packet;
 * tcp  - TCP header, may be NULL (then ACK tracking is skipped);
 * dir  - MATCH_FORWARD or reverse direction.
 *
 * Returns the expiration time the caller should store. When no branch
 * applies (tcp == NULL in the established case) the current data->expire
 * value is returned unchanged.
 */
static uint32_t
dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const struct tcphdr *tcp, int dir)
{
	uint32_t ack, expire;
	uint32_t state, old;
	uint8_t th_flags;

	expire = data->expire;
	old = state = data->state;
	/* Only FIN, SYN and RST are accumulated in the state word. */
	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
	switch (state & TCP_FLAGS) {
	case TH_SYN:			/* opening */
		expire = time_uptime + V_dyn_syn_lifetime;
		break;

	case BOTH_SYN:			/* move to established */
	case BOTH_SYN | TH_FIN:		/* one side tries to close */
	case BOTH_SYN | (TH_FIN << 8):
		if (tcp == NULL)
			break;
		ack = ntohl(tcp->th_ack);
		if (dir == MATCH_FORWARD) {
			/* Accept the ACK only if it does not move backward. */
			if (data->ack_fwd == 0 ||
			    _SEQ_GE(ack, data->ack_fwd)) {
				state |= ACK_FWD;
				if (data->ack_fwd != ack)
					ck_pr_store_32(&data->ack_fwd, ack);
			}
		} else {
			if (data->ack_rev == 0 ||
			    _SEQ_GE(ack, data->ack_rev)) {
				state |= ACK_REV;
				if (data->ack_rev != ack)
					ck_pr_store_32(&data->ack_rev, ack);
			}
		}
		if ((state & ACK_BOTH) == ACK_BOTH) {
			/*
			 * Set expire time to V_dyn_ack_lifetime only if
			 * we got ACKs for both directions.
			 * We use XOR here to avoid possible state
			 * overwriting in concurrent thread.
			 */
			expire = time_uptime + V_dyn_ack_lifetime;
			ck_pr_xor_32(&data->state, ACK_BOTH);
		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
			ck_pr_or_32(&data->state, state & ACK_BOTH);
		break;

	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
		/*
		 * NOTE(review): this clamp rewrites V_dyn_fin_lifetime itself
		 * and, unlike the default branch below, is not conditional
		 * on V_dyn_keepalive being enabled - confirm it is intended.
		 */
		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_fin_lifetime;
		break;

	default:
		/* Any other flag combination uses the RST lifetime. */
		if (V_dyn_keepalive != 0 &&
		    V_dyn_rst_lifetime >= V_dyn_keepalive_period)
			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_rst_lifetime;
	}
	/* Save TCP state if it was changed */
	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
		ck_pr_or_32(&data->state, state & TCP_FLAGS);
	return (expire);
}

/*
 * Update ULP specific state.
 * For TCP we keep sequence numbers and flags. For other protocols
 * currently we update only expire time. Packets and bytes counters
 * are also updated here.
*/
static void
dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, int dir)
{
	uint32_t expire;

	/* NOTE: we are in critical section here. */
	switch (pkt->proto) {
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		expire = time_uptime + V_dyn_udp_lifetime;
		break;
	case IPPROTO_TCP:
		/* ulp is handed over as the TCP header pointer. */
		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
		break;
	default:
		expire = time_uptime + V_dyn_short_lifetime;
	}
	/*
	 * Expiration timer has the per-second granularity, no need to update
	 * it every time when state is matched.
	 */
	if (data->expire != expire)
		ck_pr_store_32(&data->expire, expire);

	/* Account the packet in the counters of the matched direction. */
	if (dir == MATCH_FORWARD)
		DYN_COUNTER_INC(data, fwd, pktlen);
	else
		DYN_COUNTER_INC(data, rev, pktlen);
}

/*
 * Lookup IPv4 state.
 * Must be called in critical section.
 *
 * The bucket is derived from info->hashval. On entry info->version is
 * set to the bucket's current "add" version, and info->kidx (when not
 * zero) restricts the match to states with the same kidx. On match
 * info->direction is set and the state's protocol data is updated.
 * Returns the matched state or NULL.
 *
 * NOTE(review): unlike the IPv6 counterpart this definition carries no
 * 'static' qualifier in this view - confirm external linkage is intended.
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
    struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		/*
		 * Publish the pointer first, then re-check the "del"
		 * version: if it changed, restart the walk from the head.
		 */
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;
		if (s->proto != pkt->proto)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		/* Same 4-tuple as the one stored in the state. */
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			info->direction = MATCH_FORWARD;
			break;
		}
		/* Swapped 4-tuple: packet flows in the opposite direction. */
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}

	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv4 state.
 * Simplified version is used to check that matching state doesn't exist.
*/ static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt, const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx) { struct dyn_ipv4_state *s; int dir; dir = MATCH_NONE; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { if (s->proto != pkt->proto || s->kidx != kidx) continue; if (s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) { dir = MATCH_FORWARD; break; } if (s->sport == pkt->dst_port && s->dport == pkt->src_port && s->src == pkt->dst_ip && s->dst == pkt->src_ip) { dir = MATCH_REVERSE; break; } } if (s != NULL) dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir); return (s != NULL); } struct dyn_ipv4_state * dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval) { struct dyn_ipv4_state *s; uint32_t version, bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); restart: version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del); CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) { DYNSTATE_PROTECT(s); if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del)) goto restart; /* * NOTE: we do not need to check kidx, because parent rule * can not create states with different kidx. * And parent rule always created for forward direction. 
*/ if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) { if (s->limit->expire != time_uptime + V_dyn_short_lifetime) ck_pr_store_32(&s->limit->expire, time_uptime + V_dyn_short_lifetime); break; } } return (s); } static struct dyn_ipv4_state * dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt, const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket) { struct dyn_ipv4_state *s; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) { if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) break; } return (s); } #ifdef INET6 static uint32_t dyn_getscopeid(const struct ip_fw_args *args) { /* * If source or destination address is an scopeid address, we need * determine the scope zone id to resolve address scope ambiguity. */ if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) || - IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) { - MPASS(args->oif != NULL || - args->m->m_pkthdr.rcvif != NULL); - return (in6_getscopezone(args->oif != NULL ? args->oif: - args->m->m_pkthdr.rcvif, IPV6_ADDR_SCOPE_LINKLOCAL)); - } + IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) + return (in6_getscopezone(args->ifp, IPV6_ADDR_SCOPE_LINKLOCAL)); + return (0); } /* * Lookup IPv6 state. * Must be called in critical section. 
*/
static struct dyn_ipv6_state *
dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	/*
	 * The bucket is derived from info->hashval; remember the bucket's
	 * "add" version in info for the caller.
	 */
	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		/*
		 * Publish the pointer first, then re-check the "del"
		 * version: if it changed, restart the walk from the head.
		 */
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
			goto restart;
		/* Protocol and scope zone must match in both directions. */
		if (s->proto != pkt->proto || s->zoneid != zoneid)
			continue;
		/* A zero info->kidx matches a state with any kidx. */
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			info->direction = MATCH_FORWARD;
			break;
		}
		/* Swapped tuple: packet flows in the opposite direction. */
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}
	/* On match refresh expiration, protocol state and counters. */
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv6 state.
 * Simplified version is used to check that matching state doesn't exist.
*/
static int
dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		/* Here kidx must match exactly, zero is not a wildcard. */
		if (s->proto != pkt->proto || s->kidx != kidx ||
		    s->zoneid != zoneid)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	/* The existing state is updated as if the packet had matched it. */
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	/* Non-zero means that a matching state already exists. */
	return (s != NULL);
}

/*
 * Lookup IPv6 parent (limit) state without holding the bucket lock.
 * On match the parent's expiration time is refreshed to
 * time_uptime + V_dyn_short_lifetime.
 * Returns the matched parent state or NULL.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		/*
		 * Publish the pointer first, then re-check the "del"
		 * version: if it changed, restart the walk from the head.
		 */
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because parent rule
		 * can not create states with different kidx.
		 * Also parent rule always created for forward direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			/* Skip the store when the value would not change. */
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

/*
 * Lookup IPv6 parent (limit) state in the given bucket.
 * DYN_BUCKET_ASSERT() is applied to the bucket first; no version checks
 * are done and the expiration time is not touched.
 * Returns the matched parent state or NULL.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
{
	struct dyn_ipv6_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
			break;
	}
	return (s);
}

#endif /* INET6 */

/*
 * Lookup dynamic state.
 *  pkt - filled by ipfw_chk() ipfw_flow_id;
 *  ulp - determined by ipfw_chk() upper level protocol header;
 *  dyn_info - info about matched state to return back;
 * Returns pointer to state's parent rule and dyn_info. If there is
 * no state, NULL is returned.
 * On match ipfw_dyn_lookup() updates state's counters.
*/ struct ip_fw * ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp, int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info) { struct dyn_data *data; struct ip_fw *rule; IPFW_RLOCK_ASSERT(&V_layer3_chain); data = NULL; rule = NULL; info->kidx = cmd->arg1; info->direction = MATCH_NONE; info->hashval = hash_packet(&args->f_id); DYNSTATE_CRITICAL_ENTER(); if (IS_IP4_FLOW_ID(&args->f_id)) { struct dyn_ipv4_state *s; s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen); if (s != NULL) { /* * Dynamic states are created using the same 5-tuple, * so it is assumed, that parent rule for O_LIMIT * state has the same address family. */ data = s->data; if (s->type == O_LIMIT) { s = data->parent; rule = s->limit->parent; } else rule = data->parent; } } #ifdef INET6 else if (IS_IP6_FLOW_ID(&args->f_id)) { struct dyn_ipv6_state *s; s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args), ulp, info, pktlen); if (s != NULL) { data = s->data; if (s->type == O_LIMIT) { s = data->parent; rule = s->limit->parent; } else rule = data->parent; } } #endif if (data != NULL) { /* * If cached chain id is the same, we can avoid rule index * lookup. Otherwise do lookup and update chain_id and f_pos. * It is safe even if there is concurrent thread that want * update the same state, because chain->id can be changed * only under IPFW_WLOCK(). */ if (data->chain_id != V_layer3_chain.id) { data->f_pos = ipfw_find_rule(&V_layer3_chain, data->rulenum, data->ruleid); /* * Check that found state has not orphaned. * When chain->id being changed the parent * rule can be deleted. If found rule doesn't * match the parent pointer, consider this * result as MATCH_NONE and return NULL. * * This will lead to creation of new similar state * that will be added into head of this bucket. * And the state that we currently have matched * should be deleted by dyn_expire_states(). 
* * In case when dyn_keep_states is enabled, return * pointer to deleted rule and f_pos value * corresponding to penultimate rule. * When we have enabled V_dyn_keep_states, states * that become orphaned will get the DYN_REFERENCED * flag and rule will keep around. So we can return * it. But since it is not in the rules map, we need * return such f_pos value, so after the state * handling if the search will continue, the next rule * will be the last one - the default rule. */ if (V_layer3_chain.map[data->f_pos] == rule) { data->chain_id = V_layer3_chain.id; info->f_pos = data->f_pos; } else if (V_dyn_keep_states != 0) { /* * The original rule pointer is still usable. * So, we return it, but f_pos need to be * changed to point to the penultimate rule. */ MPASS(V_layer3_chain.n_rules > 1); data->chain_id = V_layer3_chain.id; data->f_pos = V_layer3_chain.n_rules - 2; info->f_pos = data->f_pos; } else { rule = NULL; info->direction = MATCH_NONE; DYN_DEBUG("rule %p [%u, %u] is considered " "invalid in data %p", rule, data->ruleid, data->rulenum, data); /* info->f_pos doesn't matter here. */ } } else info->f_pos = data->f_pos; } DYNSTATE_CRITICAL_EXIT(); #if 0 /* * Return MATCH_NONE if parent rule is in disabled set. * This will lead to creation of new similar state that * will be added into head of this bucket. * * XXXAE: we need to be able update state's set when parent * rule set is changed. 
*/ if (rule != NULL && (V_set_disable & (1 << rule->set))) { rule = NULL; info->direction = MATCH_NONE; } #endif return (rule); } static struct dyn_parent * dyn_alloc_parent(void *parent, uint32_t ruleid, uint16_t rulenum, uint32_t hashval) { struct dyn_parent *limit; limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO); if (limit == NULL) { if (last_log != time_uptime) { last_log = time_uptime; log(LOG_DEBUG, "ipfw: Cannot allocate parent dynamic state, " "consider increasing " "net.inet.ip.fw.dyn_parent_max\n"); } return (NULL); } limit->parent = parent; limit->ruleid = ruleid; limit->rulenum = rulenum; limit->hashval = hashval; limit->expire = time_uptime + V_dyn_short_lifetime; return (limit); } static struct dyn_data * dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint16_t rulenum, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen, uint32_t hashval, uint16_t fibnum) { struct dyn_data *data; data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO); if (data == NULL) { if (last_log != time_uptime) { last_log = time_uptime; log(LOG_DEBUG, "ipfw: Cannot allocate dynamic state, " "consider increasing net.inet.ip.fw.dyn_max\n"); } return (NULL); } data->parent = parent; data->ruleid = ruleid; data->rulenum = rulenum; data->fibnum = fibnum; data->hashval = hashval; data->expire = time_uptime + V_dyn_syn_lifetime; dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD); return (data); } static struct dyn_ipv4_state * dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint16_t kidx, uint8_t type) { struct dyn_ipv4_state *s; s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO); if (s == NULL) return (NULL); s->type = type; s->kidx = kidx; s->proto = pkt->proto; s->sport = pkt->src_port; s->dport = pkt->dst_port; s->src = pkt->src_ip; s->dst = pkt->dst_ip; return (s); } /* * Add IPv4 parent state. * Returns pointer to parent state. When it is not NULL we are in * critical section and pointer protected by hazard pointer. 
* When some error occurs, it returns NULL and exit from critical section
 * is not needed.
 */
static struct dyn_ipv4_state *
dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t hashval, uint32_t version,
    uint16_t kidx)
{
	struct dyn_ipv4_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		/* Do not leak the limit descriptor allocated above. */
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
	/* Enter critical section and protect 's' before unlocking. */
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

/*
 * Add IPv4 state to the bucket selected by hashval.
 * The cached lookup result in 'info' is trusted only when its direction
 * is known and its kidx, hashval and bucket "add" version still match;
 * otherwise the bucket is searched again under the bucket lock.
 * Returns 0 on success, EEXIST when a matching state already exists and
 * ENOMEM when an allocation fails.
 */
static int
dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint16_t rulenum,
    const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
    uint16_t kidx, uint8_t type)
{
	struct dyn_ipv4_state *s;
	/*
	 * NOTE(review): the IPv6 counterpart declares this variable as
	 * 'struct dyn_data *'.
	 */
	void *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		/* Do not leak the data allocated above. */
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}

#ifdef INET6
/*
 * Allocate an IPv6 state and copy the flow tuple and scope zone id
 * into it. Returns NULL when the allocation fails.
 */
static struct dyn_ipv6_state *
dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint16_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;

	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->zoneid = zoneid;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip6;
	s->dst = pkt->dst_ip6;
	return (s);
}

/*
 * Add IPv6 parent state.
 * Returns pointer to parent state. When it is not NULL we are in
 * critical section and pointer protected by hazard pointer.
 * When some error occurs, it returns NULL and exit from critical section
 * is not needed.
 */
static struct dyn_ipv6_state *
dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, uint32_t hashval,
    uint32_t version, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		/* Do not leak the limit descriptor allocated above. */
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
	/* Enter critical section and protect 's' before unlocking. */
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

/*
 * Add IPv6 state to the bucket selected by hashval.
 * The cached lookup result in 'info' is trusted only when its direction
 * is known and its kidx, hashval and bucket "add" version still match;
 * otherwise the bucket is searched again under the bucket lock.
 * Returns 0 on success, EEXIST when a matching state already exists and
 * ENOMEM when an allocation fails.
 */
static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint16_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp,
    int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint16_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		/* Do not leak the data allocated above. */
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}
#endif /* INET6 */

/*
 * Find or create the parent state for an O_LIMIT rule and account one
 * more session in it.
 *
 * pkt     - masked flow id that identifies the parent state;
 * zoneid  - IPv6 scope zone id (only used for IPv6 flows);
 * rule    - the rule that owns the limit;
 * hashval - hash that selects the parent bucket;
 * limit   - maximum number of sessions allowed;
 * kidx    - state name index stored in a newly created parent.
 *
 * Returns pointer to the parent state (struct dyn_ipv4_state or
 * struct dyn_ipv6_state), or NULL when the address family is not
 * supported, the parent can not be created, or the limit is reached.
 * The function is never left inside the critical section.
 */
static void *
dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint16_t kidx)
{
	char sbuf[24];
	struct dyn_parent *p;
	void *ret;
	uint32_t bucket, version;

	p = NULL;
	ret = NULL;
	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(pkt)) {
		struct dyn_ipv4_state *s;

		/* Remember the version to detect concurrent insertions. */
		version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
		/*
		 * NOTE(review): 'bucket' is passed where the callee's
		 * parameter is named hashval, so DYN_BUCKET() is applied
		 * to it a second time - confirm this is harmless.
		 */
		s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * will acquire bucket lock.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv4_parent(rule, rule->id,
			    rule->rulenum, pkt, hashval, version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt)) {
		struct dyn_ipv6_state *s;

		/* Remember the version to detect concurrent insertions. */
		version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
		s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * can acquire bucket mutex.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv6_parent(rule, rule->id,
			    rule->rulenum, pkt, zoneid, hashval, version,
			    kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#endif
	else {
		DYNSTATE_CRITICAL_EXIT();
		return (NULL);
	}

	/* Check the limit */
	if (DPARENT_COUNT(p) >= limit) {
		DYNSTATE_CRITICAL_EXIT();
		/* Log at most once per time_uptime value. */
		if (V_fw_verbose && last_log != time_uptime) {
			last_log = time_uptime;
			snprintf(sbuf, sizeof(sbuf), "%u drop session",
			    rule->rulenum);
			print_dyn_rule_flags(pkt, O_LIMIT,
			    LOG_SECURITY | LOG_DEBUG, sbuf,
			    "too many entries");
		}
		return (NULL);
	}

	/* Take new session into account. */
	DPARENT_COUNT_INC(p);
	/*
	 * We must exit from critical section because the following code
	 * can acquire bucket mutex.
	 * We rely on the 'count' field. The state will not expire
	 * until it has some child states, i.e. 'count' field is not zero.
	 * Return state pointer, it will be used by child states as parent.
	 */
	DYNSTATE_CRITICAL_EXIT();
	return (ret);
}

/*
 * Create a dynamic state of the given type (O_LIMIT or O_KEEP_STATE)
 * for the flow 'pkt'.
 * For O_LIMIT a parent state keyed by the flow masked with limit_mask
 * is looked up or created first, and the new state is linked to it.
 * Returns 0 on success (including the case when another thread has
 * already created the state), EACCES when the parent can not be
 * obtained (for example the limit is reached), EAFNOSUPPORT for an
 * unsupported address family, or the error returned by the add routine.
 */
static int
dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint16_t fibnum, const void *ulp, int pktlen, struct ip_fw *rule,
    struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask,
    uint16_t kidx, uint8_t type)
{
	struct ipfw_flow_id id;
	uint32_t hashval, parent_hashval, ruleid, rulenum;
	int ret;

	MPASS(type == O_LIMIT || type == O_KEEP_STATE);

	/* Save the ids now: 'rule' is reused for the parent state below. */
	ruleid = rule->id;
	rulenum = rule->rulenum;
	if (type == O_LIMIT) {
		/* Create masked flow id and calculate bucket */
		id.addr_type = pkt->addr_type;
		id.proto = pkt->proto;
		id.fib = fibnum; /* unused */
		id.src_port = (limit_mask & DYN_SRC_PORT) ?
		    pkt->src_port: 0;
		id.dst_port = (limit_mask & DYN_DST_PORT) ?
		    pkt->dst_port: 0;
		if (IS_IP4_FLOW_ID(pkt)) {
			id.src_ip = (limit_mask & DYN_SRC_ADDR) ?
			    pkt->src_ip: 0;
			id.dst_ip = (limit_mask & DYN_DST_ADDR) ?
			    pkt->dst_ip: 0;
		}
#ifdef INET6
		else if (IS_IP6_FLOW_ID(pkt)) {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip6 = pkt->src_ip6;
			else
				memset(&id.src_ip6, 0, sizeof(id.src_ip6));
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip6 = pkt->dst_ip6;
			else
				memset(&id.dst_ip6, 0, sizeof(id.dst_ip6));
		}
#endif
		else
			return (EAFNOSUPPORT);

		parent_hashval = hash_parent(&id, rule);
		rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval,
		    limit, kidx);
		if (rule == NULL) {
#if 0
			if (V_fw_verbose && last_log != time_uptime) {
				last_log = time_uptime;
				snprintf(sbuf, sizeof(sbuf),
				    "%u drop session", rule->rulenum);
				print_dyn_rule_flags(pkt, O_LIMIT,
				    LOG_SECURITY | LOG_DEBUG, sbuf,
				    "too many entries");
			}
#endif
			return (EACCES);
		}
		/*
		 * Limit is not reached, create new state.
		 * Now rule points to parent state.
		 */
	}

	/* The child state is keyed by the full (unmasked) flow id. */
	hashval = hash_packet(pkt);
	if (IS_IP4_FLOW_ID(pkt))
		ret = dyn_add_ipv4_state(rule, ruleid, rulenum, pkt,
		    ulp, pktlen, hashval, info, fibnum, kidx, type);
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt))
		ret = dyn_add_ipv6_state(rule, ruleid, rulenum, pkt,
		    zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type);
#endif /* INET6 */
	else
		ret = EAFNOSUPPORT;

	if (type == O_LIMIT) {
		if (ret != 0) {
			/*
			 * We failed to create child state for O_LIMIT
			 * opcode. Since we already counted it in the parent,
			 * we must revert counter back. The 'rule' points to
			 * parent state, use it to get dyn_parent.
			 *
			 * XXXAE: it should be safe to use 'rule' pointer
			 *	  without extra lookup, parent state is referenced
			 *	  and should not be freed.
			 */
			if (IS_IP4_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv4_state *)rule)->limit);
#ifdef INET6
			else if (IS_IP6_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv6_state *)rule)->limit);
#endif
		}
	}
	/*
	 * EEXIST means that simultaneous thread has created this
	 * state. Consider this as success.
	 *
	 * XXXAE: should we invalidate 'info' content here?
	 */
	if (ret == EEXIST)
		return (0);
	return (ret);
}

/*
 * Install dynamic state.
* chain - ipfw's instance;
 * rule - the parent rule that installs the state;
 * cmd - opcode that installs the state;
 * args - ipfw arguments;
 * ulp - upper level protocol header;
 * pktlen - packet length;
 * info - dynamic state lookup info;
 * tablearg - tablearg id.
 *
 * Returns non-zero value (failure) if state is not installed because
 * of errors or because session limitations are enforced.
 */
int
ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
    uint32_t tablearg)
{
	uint32_t limit;
	uint16_t limit_mask;

	/*
	 * Only O_LIMIT carries a connection limit and an address/port mask;
	 * every other opcode installs the state without limiting.
	 *
	 * NOTE(review): 'tablearg' is not named in this body; presumably
	 * IP_FW_ARG_TABLEARG() expands to a reference to it -- confirm
	 * against the macro definition.
	 */
	if (cmd->o.opcode == O_LIMIT) {
		limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
		limit_mask = cmd->limit_mask;
	} else {
		limit = 0;
		limit_mask = 0;
	}
	/*
	 * The second argument (zone id) is taken from dyn_getscopeid() for
	 * IPv6 flows when INET6 is compiled in, and is 0 otherwise.
	 */
	return (dyn_install_state(&args->f_id,
#ifdef INET6
	    IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
#endif
	    0, M_GETFIB(args->m), ulp, pktlen, rule, info, limit,
	    limit_mask, cmd->o.arg1, cmd->o.opcode));
}

/*
 * Free safe to remove state entries from expired lists.
 *
 * NOTE: the 'chain' argument is not referenced by the body.
 */
static void
dyn_free_states(struct ip_fw_chain *chain)
{
	struct dyn_ipv4_state *s4, *s4n;
#ifdef INET6
	struct dyn_ipv6_state *s6, *s6n;
#endif
	int cached_count, i;

	/*
	 * We keep pointers to objects that are in use on each CPU
	 * in the per-cpu dyn_hp pointer. When an object is going to be
	 * removed, it is first unlinked from the corresponding
	 * list. This leads to changing of dyn_bucket_xxx_delver version.
	 * Unlinked objects are placed into the corresponding dyn_expired_xxx
	 * list. Reader that is going to dereference object pointer checks
	 * dyn_bucket_xxx_delver version before and after storing pointer
	 * into dyn_hp. If version is the same, the object is protected
	 * from freeing and it is safe to dereference. Otherwise the reader
	 * tries to iterate list again from the beginning, but this object
	 * is now unlinked and thus will not be accessible.
	 *
	 * Copy dyn_hp pointers for each CPU into dyn_hp_cache array.
	 * It does not matter that some pointer can be changed in
	 * time while we are copying. We need to check, that objects
	 * removed in the previous pass are not in use. And if dyn_hp
	 * pointer does not contain it in the time when we are copying,
	 * it will not appear there, because it is already unlinked.
	 * And for new pointers we will not free objects that will be
	 * unlinked in this pass.
	 */
	cached_count = 0;
	CPU_FOREACH(i) {
		/* Only non-NULL pointers are kept, so the cache is dense. */
		dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
		if (dyn_hp_cache[cached_count] != NULL)
			cached_count++;
	}

	/*
	 * Free expired states that are safe to free.
	 * Check each entry from previous pass in the dyn_expired_xxx
	 * list, if pointer to the object is in the dyn_hp_cache array,
	 * keep it until next pass. Otherwise it is safe to free the
	 * object.
	 *
	 * An O_LIMIT_PARENT entry whose limit->count is still non-zero is
	 * also kept on the list, even when no CPU references it.
	 *
	 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
	 */
#define	DYN_FREE_STATES(s, next, name)		do {			\
	s = SLIST_FIRST(&V_dyn_expired_ ## name);			\
	while (s != NULL) {						\
		next = SLIST_NEXT(s, expired);				\
		for (i = 0; i < cached_count; i++)			\
			if (dyn_hp_cache[i] == s)			\
				break;					\
		if (i == cached_count) {				\
			if (s->type == O_LIMIT_PARENT &&		\
			    s->limit->count != 0) {			\
				s = next;				\
				continue;				\
			}						\
			SLIST_REMOVE(&V_dyn_expired_ ## name,		\
			    s, dyn_ ## name ## _state, expired);	\
			if (s->type == O_LIMIT_PARENT)			\
				uma_zfree(V_dyn_parent_zone, s->limit);	\
			else						\
				uma_zfree(V_dyn_data_zone, s->data);	\
			uma_zfree(V_dyn_ ## name ## _zone, s);		\
		}							\
		s = next;						\
	}								\
} while (0)

	/*
	 * Protect access to expired lists with DYN_EXPIRED_LOCK.
	 * Userland can invoke ipfw_expire_dyn_states() to delete
	 * specific states, this will lead to modification of expired
	 * lists.
	 *
	 * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use
	 *	  IPFW_UH_WLOCK to protect access to these lists.
	 */
	DYN_EXPIRED_LOCK();
	DYN_FREE_STATES(s4, s4n, ipv4);
#ifdef INET6
	DYN_FREE_STATES(s6, s6n, ipv6);
#endif
	DYN_EXPIRED_UNLOCK();
#undef DYN_FREE_STATES
}

/*
 * Check a state's rule number and set against the range described by @rt.
 *
 * Returns:
 * 0 when state is not matched by specified range;
 * 1 when state is matched by specified range;
 * 2 when state is matched by specified range and requested deletion of
 *   dynamic states.
 */
static int
dyn_match_range(uint16_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
{

	MPASS(rt != NULL);
	/* flush all states */
	if (rt->flags & IPFW_RCFLAG_ALL) {
		if (rt->flags & IPFW_RCFLAG_DYNAMIC)
			return (2); /* forced */
		return (1);
	}
	/* Set filter requested and the state's set differs: no match. */
	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
		return (0);
	/* Range filter requested and rulenum is outside [start, end]. */
	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
	    (rulenum < rt->start_rule || rulenum > rt->end_rule))
		return (0);
	if (rt->flags & IPFW_RCFLAG_DYNAMIC)
		return (2);
	return (1);
}

/*
 * Mark the state data as DYN_REFERENCED and take one reference on both
 * the named state object found by @kidx and on the parent @rule.
 * Does nothing when the data is already marked DYN_REFERENCED.
 */
static void
dyn_acquire_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint16_t kidx)
{
	struct dyn_state_obj *obj;

	/*
	 * Do not acquire reference twice.
	 * This can happen when rule deletion executed for
	 * the same range, but different ruleset id.
	 */
	if (data->flags & DYN_REFERENCED)
		return;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	data->flags |= DYN_REFERENCED;
	/* Reference the named object */
	obj = SRV_OBJECT(ch, kidx);
	obj->no.refcnt++;
	MPASS(obj->no.etlv == IPFW_TLV_STATE_NAME);

	/* Reference the parent rule */
	rule->refcnt++;
}

/*
 * Drop the references taken by dyn_acquire_rule(): one on the named
 * state object found by @kidx and one on @rule.
 * The 'data' argument is not referenced by the body.
 */
static void
dyn_release_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint16_t kidx)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	/* The last reference destroys the named object instead. */
	obj = SRV_OBJECT(ch, kidx);
	if (obj->no.refcnt == 1)
		dyn_destroy(ch, &obj->no);
	else
		obj->no.refcnt--;

	/* Hand the rule to ipfw_free_rule() once the count drops to 1. */
	if (--rule->refcnt == 1)
		ipfw_free_rule(rule);
}

/*
 * We do not keep O_LIMIT_PARENT states when V_dyn_keep_states is enabled.
 * O_LIMIT state is created when new connection is going to be established
 * and there is no matching state.
So, since the old parent rule was deleted * we can't create new states with old parent, and thus we can not account * new connections with already established connections, and can not do * proper limiting. */ static int dyn_match_ipv4_state(struct ip_fw_chain *ch, struct dyn_ipv4_state *s, const ipfw_range_tlv *rt) { struct ip_fw *rule; int ret; if (s->type == O_LIMIT_PARENT) { rule = s->limit->parent; return (dyn_match_range(s->limit->rulenum, rule->set, rt)); } rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv4_state *)rule)->limit->parent; ret = dyn_match_range(s->data->rulenum, rule->set, rt); if (ret == 0 || V_dyn_keep_states == 0 || ret > 1) return (ret); dyn_acquire_rule(ch, s->data, rule, s->kidx); return (0); } #ifdef INET6 static int dyn_match_ipv6_state(struct ip_fw_chain *ch, struct dyn_ipv6_state *s, const ipfw_range_tlv *rt) { struct ip_fw *rule; int ret; if (s->type == O_LIMIT_PARENT) { rule = s->limit->parent; return (dyn_match_range(s->limit->rulenum, rule->set, rt)); } rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv6_state *)rule)->limit->parent; ret = dyn_match_range(s->data->rulenum, rule->set, rt); if (ret == 0 || V_dyn_keep_states == 0 || ret > 1) return (ret); dyn_acquire_rule(ch, s->data, rule, s->kidx); return (0); } #endif /* * Unlink expired entries from states lists. * @rt can be used to specify the range of states for deletion. */ static void dyn_expire_states(struct ip_fw_chain *ch, ipfw_range_tlv *rt) { struct dyn_ipv4_slist expired_ipv4; #ifdef INET6 struct dyn_ipv6_slist expired_ipv6; struct dyn_ipv6_state *s6, *s6n, *s6p; #endif struct dyn_ipv4_state *s4, *s4n, *s4p; void *rule; int bucket, removed, length, max_length; IPFW_UH_WLOCK_ASSERT(ch); /* * Unlink expired states from each bucket. * With acquired bucket lock iterate entries of each lists: * ipv4, ipv4_parent, ipv6, and ipv6_parent. 
Check expired time * and unlink entry from the list, link entry into temporary * expired_xxx lists then bump "del" bucket version. * * When an entry is removed, corresponding states counter is * decremented. If entry has O_LIMIT type, parent's reference * counter is decremented. * * NOTE: this function can be called from userspace context * when user deletes rules. In this case all matched states * will be forcedly unlinked. O_LIMIT_PARENT states will be kept * in the expired lists until reference counter become zero. */ #define DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra) do { \ length = 0; \ removed = 0; \ prev = NULL; \ s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]); \ while (s != NULL) { \ next = CK_SLIST_NEXT(s, entry); \ if ((TIME_LEQ((s)->exp, time_uptime) && extra) || \ (rt != NULL && \ dyn_match_ ## af ## _state(ch, s, rt))) { \ if (prev != NULL) \ CK_SLIST_REMOVE_AFTER(prev, entry); \ else \ CK_SLIST_REMOVE_HEAD( \ &V_dyn_ ## name [bucket], entry); \ removed++; \ SLIST_INSERT_HEAD(&expired_ ## af, s, expired); \ if (s->type == O_LIMIT_PARENT) \ DYN_COUNT_DEC(dyn_parent_count); \ else { \ DYN_COUNT_DEC(dyn_count); \ if (s->data->flags & DYN_REFERENCED) { \ rule = s->data->parent; \ if (s->type == O_LIMIT) \ rule = ((__typeof(s)) \ rule)->limit->parent;\ dyn_release_rule(ch, s->data, \ rule, s->kidx); \ } \ if (s->type == O_LIMIT) { \ s = s->data->parent; \ DPARENT_COUNT_DEC(s->limit); \ } \ } \ } else { \ prev = s; \ length++; \ } \ s = next; \ } \ if (removed != 0) \ DYN_BUCKET_VERSION_BUMP(bucket, name ## _del); \ if (length > max_length) \ max_length = length; \ } while (0) SLIST_INIT(&expired_ipv4); #ifdef INET6 SLIST_INIT(&expired_ipv6); #endif max_length = 0; for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_BUCKET_LOCK(bucket); DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1); DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4, ipv4_parent, (s4->limit->count == 0)); #ifdef INET6 DYN_UNLINK_STATES(s6, s6p, s6n, 
data->expire, ipv6, ipv6, 1); DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6, ipv6_parent, (s6->limit->count == 0)); #endif DYN_BUCKET_UNLOCK(bucket); } /* Update curr_max_length for statistics. */ V_curr_max_length = max_length; /* * Concatenate temporary lists with global expired lists. */ DYN_EXPIRED_LOCK(); SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4, dyn_ipv4_state, expired); #ifdef INET6 SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6, dyn_ipv6_state, expired); #endif DYN_EXPIRED_UNLOCK(); #undef DYN_UNLINK_STATES #undef DYN_UNREF_STATES } static struct mbuf * dyn_mgethdr(int len, uint16_t fibnum) { struct mbuf *m; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #ifdef MAC mac_netinet_firewall_send(m); #endif M_SETFIB(m, fibnum); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_len = m->m_pkthdr.len = len; bzero(m->m_data, len); return (m); } static void dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst, uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport) { struct tcphdr *tcp; struct ip *ip; ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(m->m_len); ip->ip_off |= htons(IP_DF); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_TCP; ip->ip_src.s_addr = htonl(src); ip->ip_dst.s_addr = htonl(dst); tcp = mtodo(m, sizeof(struct ip)); tcp->th_sport = htons(sport); tcp->th_dport = htons(dport); tcp->th_off = sizeof(struct tcphdr) >> 2; tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); m->m_pkthdr.csum_flags = CSUM_TCP; } static void dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s) { struct mbuf *m; if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) { m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), 
s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv4(m, s->dst, s->src, s->data->ack_fwd - 1, s->data->ack_rev, s->dport, s->sport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv4 " "keepalive queue is reached.\n"); return; } } } if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv4(m, s->src, s->dst, s->data->ack_rev - 1, s->data->ack_fwd, s->sport, s->dport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv4 " "keepalive queue is reached.\n"); return; } } } } /* * Prepare and send keep-alive packets. */ static void dyn_send_keepalive_ipv4(struct ip_fw_chain *chain) { struct mbufq q; struct mbuf *m; struct dyn_ipv4_state *s; uint32_t bucket; mbufq_init(&q, INT_MAX); IPFW_UH_RLOCK(chain); /* * It is safe to not use hazard pointer and just do lockless * access to the lists, because states entries can not be deleted * while we hold IPFW_UH_RLOCK. */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { /* * Only established TCP connections that will * become expired withing dyn_keepalive_interval. 
*/ if (s->proto != IPPROTO_TCP || (s->data->state & BOTH_SYN) != BOTH_SYN || TIME_LEQ(time_uptime + V_dyn_keepalive_interval, s->data->expire)) continue; dyn_enqueue_keepalive_ipv4(&q, s); } } IPFW_UH_RUNLOCK(chain); while ((m = mbufq_dequeue(&q)) != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } #ifdef INET6 static void dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src, const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport) { struct tcphdr *tcp; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = *src; if (IN6_IS_ADDR_LINKLOCAL(src)) ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff); ip6->ip6_dst = *dst; if (IN6_IS_ADDR_LINKLOCAL(dst)) ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff); tcp = mtodo(m, sizeof(struct ip6_hdr)); tcp->th_sport = htons(sport); tcp->th_dport = htons(dport); tcp->th_off = sizeof(struct tcphdr) >> 2; tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr), IPPROTO_TCP, 0); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; } static void dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s) { struct mbuf *m; if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) { m = dyn_mgethdr(sizeof(struct ip6_hdr) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv6(m, &s->dst, &s->src, s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev, s->dport, s->sport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv6 " "keepalive queue is reached.\n"); return; } } } if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { m = dyn_mgethdr(sizeof(struct ip6_hdr) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { 
dyn_make_keepalive_ipv6(m, &s->src, &s->dst, s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd, s->sport, s->dport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv6 " "keepalive queue is reached.\n"); return; } } } } static void dyn_send_keepalive_ipv6(struct ip_fw_chain *chain) { struct mbufq q; struct mbuf *m; struct dyn_ipv6_state *s; uint32_t bucket; mbufq_init(&q, INT_MAX); IPFW_UH_RLOCK(chain); /* * It is safe to not use hazard pointer and just do lockless * access to the lists, because states entries can not be deleted * while we hold IPFW_UH_RLOCK. */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) { /* * Only established TCP connections that will * become expired withing dyn_keepalive_interval. */ if (s->proto != IPPROTO_TCP || (s->data->state & BOTH_SYN) != BOTH_SYN || TIME_LEQ(time_uptime + V_dyn_keepalive_interval, s->data->expire)) continue; dyn_enqueue_keepalive_ipv6(&q, s); } } IPFW_UH_RUNLOCK(chain); while ((m = mbufq_dequeue(&q)) != NULL) ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif /* INET6 */ static void dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new) { #ifdef INET6 struct dyn_ipv6ck_slist *ipv6, *ipv6_parent; uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del; struct dyn_ipv6_state *s6; #endif struct dyn_ipv4ck_slist *ipv4, *ipv4_parent; uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del; struct dyn_ipv4_state *s4; struct mtx *bucket_lock; void *tmp; uint32_t bucket; MPASS(powerof2(new)); DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new); /* * Allocate and initialize new lists. * XXXAE: on memory pressure this can disable callout timer. 
*/ bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW, M_WAITOK | M_ZERO); ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, M_WAITOK | M_ZERO); ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, M_WAITOK | M_ZERO); ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); #ifdef INET6 ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, M_WAITOK | M_ZERO); ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, M_WAITOK | M_ZERO); ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO); #endif for (bucket = 0; bucket < new; bucket++) { DYN_BUCKET_LOCK_INIT(bucket_lock, bucket); CK_SLIST_INIT(&ipv4[bucket]); CK_SLIST_INIT(&ipv4_parent[bucket]); #ifdef INET6 CK_SLIST_INIT(&ipv6[bucket]); CK_SLIST_INIT(&ipv6_parent[bucket]); #endif } #define DYN_RELINK_STATES(s, hval, i, head, ohead) do { \ while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) { \ CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry); \ CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)], \ s, entry); \ } \ } while (0) /* * Prevent rules changing from userland. */ IPFW_UH_WLOCK(chain); /* * Hold traffic processing until we finish resize to * prevent access to states lists. 
*/ IPFW_WLOCK(chain); /* Re-link all dynamic states */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4); DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent, ipv4_parent); #ifdef INET6 DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6); DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent, ipv6_parent); #endif } #define DYN_SWAP_PTR(old, new, tmp) do { \ tmp = old; \ old = new; \ new = tmp; \ } while (0) /* Swap pointers */ DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp); DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp); DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp); DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp); #ifdef INET6 DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp); DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp); DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp); #endif bucket = V_curr_dyn_buckets; V_curr_dyn_buckets = new; IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); /* Release old resources */ while (bucket-- != 0) DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket); free(bucket_lock, M_IPFW); free(ipv4, M_IPFW); free(ipv4_parent, M_IPFW); free(ipv4_add, M_IPFW); free(ipv4_parent_add, M_IPFW); free(ipv4_del, M_IPFW); free(ipv4_parent_del, M_IPFW); #ifdef INET6 free(ipv6, M_IPFW); free(ipv6_parent, M_IPFW); free(ipv6_add, M_IPFW); free(ipv6_parent_add, M_IPFW); free(ipv6_del, M_IPFW); free(ipv6_parent_del, M_IPFW); #endif } /* * This function is used to perform various maintenance * on dynamic hash lists. Currently it is called every second. */ static void dyn_tick(void *vnetx) { uint32_t buckets; CURVNET_SET((struct vnet *)vnetx); /* * First free states unlinked in previous passes. 
*/ dyn_free_states(&V_layer3_chain); /* * Now unlink others expired states. * We use IPFW_UH_WLOCK to avoid concurrent call of * dyn_expire_states(). It is the only function that does * deletion of state entries from states lists. */ IPFW_UH_WLOCK(&V_layer3_chain); dyn_expire_states(&V_layer3_chain, NULL); IPFW_UH_WUNLOCK(&V_layer3_chain); /* * Send keepalives if they are enabled and the time has come. */ if (V_dyn_keepalive != 0 && V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) { V_dyn_keepalive_last = time_uptime; dyn_send_keepalive_ipv4(&V_layer3_chain); #ifdef INET6 dyn_send_keepalive_ipv6(&V_layer3_chain); #endif } /* * Check if we need to resize the hash: * if current number of states exceeds number of buckets in hash, * and dyn_buckets_max permits to grow the number of buckets, then * do it. Grow hash size to the minimum power of 2 which is bigger * than current states count. */ if (V_curr_dyn_buckets < V_dyn_buckets_max && (V_curr_dyn_buckets < V_dyn_count / 2 || ( V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) { buckets = 1 << fls(V_dyn_count); if (buckets > V_dyn_buckets_max) buckets = V_dyn_buckets_max; dyn_grow_hashtable(&V_layer3_chain, buckets); } callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0); CURVNET_RESTORE(); } void ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { /* * Do not perform any checks if we currently have no dynamic states */ if (V_dyn_count == 0) return; IPFW_UH_WLOCK_ASSERT(chain); dyn_expire_states(chain, rt); } /* * Pass through all states and reset eaction for orphaned rules. 
*/ void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t default_id, uint16_t instance_id) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; struct ip_fw *rule; uint32_t bucket; #define DYN_RESET_EACTION(s, h, b) \ CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ if ((s->data->flags & DYN_REFERENCED) == 0) \ continue; \ rule = s->data->parent; \ if (s->type == O_LIMIT) \ rule = ((__typeof(s))rule)->limit->parent; \ ipfw_reset_eaction(ch, rule, eaction_id, \ default_id, instance_id); \ } IPFW_UH_WLOCK_ASSERT(ch); if (V_dyn_count == 0) return; for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_RESET_EACTION(s4, ipv4, bucket); #ifdef INET6 DYN_RESET_EACTION(s6, ipv6, bucket); #endif } } /* * Returns size of dynamic states in legacy format */ int ipfw_dyn_len(void) { return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule)); } /* * Returns number of dynamic states. * Marks every named object index used by dynamic states with bit in @bmask. * Returns number of named objects accounted in bmask via @nocnt. * Used by dump format v1 (current). */ uint32_t ipfw_dyn_get_count(uint32_t *bmask, int *nocnt) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; uint32_t bucket; #define DYN_COUNT_OBJECTS(s, h, b) \ CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ MPASS(s->kidx != 0); \ if (ipfw_mark_object_kidx(bmask, IPFW_TLV_STATE_NAME, \ s->kidx) != 0) \ (*nocnt)++; \ } IPFW_UH_RLOCK_ASSERT(&V_layer3_chain); /* No need to pass through all the buckets. */ *nocnt = 0; if (V_dyn_count + V_dyn_parent_count == 0) return (0); for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_COUNT_OBJECTS(s4, ipv4, bucket); #ifdef INET6 DYN_COUNT_OBJECTS(s6, ipv6, bucket); #endif } return (V_dyn_count + V_dyn_parent_count); } /* * Check if rule contains at least one dynamic opcode. * * Returns 1 if such opcode is found, 0 otherwise. 
*/ int ipfw_is_dyn_rule(struct ip_fw *rule) { int cmdlen, l; ipfw_insn *cmd; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { case O_LIMIT: case O_KEEP_STATE: case O_PROBE_STATE: case O_CHECK_STATE: return (1); } } return (0); } static void dyn_export_parent(const struct dyn_parent *p, uint16_t kidx, uint8_t set, ipfw_dyn_rule *dst) { dst->dyn_type = O_LIMIT_PARENT; dst->kidx = kidx; dst->count = (uint16_t)DPARENT_COUNT(p); dst->expire = TIME_LEQ(p->expire, time_uptime) ? 0: p->expire - time_uptime; /* 'rule' is used to pass up the rule number and set */ memcpy(&dst->rule, &p->rulenum, sizeof(p->rulenum)); /* store set number into high word of dst->rule pointer. */ memcpy((char *)&dst->rule + sizeof(p->rulenum), &set, sizeof(set)); /* unused fields */ dst->pcnt = 0; dst->bcnt = 0; dst->parent = NULL; dst->state = 0; dst->ack_fwd = 0; dst->ack_rev = 0; dst->bucket = p->hashval; /* * The legacy userland code will interpret a NULL here as a marker * for the last dynamic rule. */ dst->next = (ipfw_dyn_rule *)1; } static void dyn_export_data(const struct dyn_data *data, uint16_t kidx, uint8_t type, uint8_t set, ipfw_dyn_rule *dst) { dst->dyn_type = type; dst->kidx = kidx; dst->pcnt = data->pcnt_fwd + data->pcnt_rev; dst->bcnt = data->bcnt_fwd + data->bcnt_rev; dst->expire = TIME_LEQ(data->expire, time_uptime) ? 0: data->expire - time_uptime; /* 'rule' is used to pass up the rule number and set */ memcpy(&dst->rule, &data->rulenum, sizeof(data->rulenum)); /* store set number into high word of dst->rule pointer. 
*/ memcpy((char *)&dst->rule + sizeof(data->rulenum), &set, sizeof(set)); dst->state = data->state; if (data->flags & DYN_REFERENCED) dst->state |= IPFW_DYN_ORPHANED; /* unused fields */ dst->parent = NULL; dst->ack_fwd = data->ack_fwd; dst->ack_rev = data->ack_rev; dst->count = 0; dst->bucket = data->hashval; /* * The legacy userland code will interpret a NULL here as a marker * for the last dynamic rule. */ dst->next = (ipfw_dyn_rule *)1; } static void dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst) { struct ip_fw *rule; switch (s->type) { case O_LIMIT_PARENT: rule = s->limit->parent; dyn_export_parent(s->limit, s->kidx, rule->set, dst); break; default: rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv4_state *)rule)->limit->parent; dyn_export_data(s->data, s->kidx, s->type, rule->set, dst); } dst->id.dst_ip = s->dst; dst->id.src_ip = s->src; dst->id.dst_port = s->dport; dst->id.src_port = s->sport; dst->id.fib = s->data->fibnum; dst->id.proto = s->proto; dst->id._flags = 0; dst->id.addr_type = 4; memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6)); memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6)); dst->id.flow_id6 = dst->id.extra = 0; } #ifdef INET6 static void dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst) { struct ip_fw *rule; switch (s->type) { case O_LIMIT_PARENT: rule = s->limit->parent; dyn_export_parent(s->limit, s->kidx, rule->set, dst); break; default: rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv6_state *)rule)->limit->parent; dyn_export_data(s->data, s->kidx, s->type, rule->set, dst); } dst->id.src_ip6 = s->src; dst->id.dst_ip6 = s->dst; dst->id.dst_port = s->dport; dst->id.src_port = s->sport; dst->id.fib = s->data->fibnum; dst->id.proto = s->proto; dst->id._flags = 0; dst->id.addr_type = 6; dst->id.dst_ip = dst->id.src_ip = 0; dst->id.flow_id6 = dst->id.extra = 0; } #endif /* INET6 */ /* * Fills the buffer given by @sd with dynamic states. 
* Used by dump format v1 (current).
 *
 * The output is one ipfw_obj_ctlv header followed by one ipfw_obj_dyntlv
 * per state; for each bucket parent states are written before regular
 * states, IPv4 before IPv6.
 *
 * Returns 0 on success, ENOMEM when ipfw_get_sopt_space() cannot provide
 * room for the header or for a state record.
 */
int
ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_obj_dyntlv *dst, *last;
	ipfw_obj_ctlv *ctlv;
	uint32_t bucket;

	/* Nothing is written at all when there are no dynamic states. */
	if (V_dyn_count == 0)
		return (0);

	/*
	 * IPFW_UH_RLOCK guarantees that another userland request
	 * and callout thread will not delete entries from states
	 * lists.
	 */
	IPFW_UH_RLOCK_ASSERT(chain);

	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
	if (ctlv == NULL)
		return (ENOMEM);
	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
	ctlv->objsize = sizeof(ipfw_obj_dyntlv);
	last = NULL;

	/*
	 * Export every state of list 'h' in bucket 'b'. Note that the
	 * macro returns ENOMEM from the enclosing function when the
	 * sockopt buffer is exhausted.
	 */
#define	DYN_EXPORT_STATES(s, af, h, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
		dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd,	\
		    sizeof(ipfw_obj_dyntlv));				\
		if (dst == NULL)					\
			return (ENOMEM);				\
		dyn_export_ ## af ## _state(s, &dst->state);		\
		dst->head.length = sizeof(ipfw_obj_dyntlv);		\
		dst->head.type = IPFW_TLV_DYN_ENT;			\
		last = dst;						\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	/* mark last dynamic rule */
	if (last != NULL)
		last->head.flags = IPFW_DF_LAST; /* XXX: unused */
	return (0);
#undef DYN_EXPORT_STATES
}

/*
 * Fill given buffer with dynamic states (legacy format).
 * IPFW_UH_RLOCK has to be held while calling.
*/
void
ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_dyn_rule *p, *last = NULL;
	char *bp;
	uint32_t bucket;

	/* *pbp is left untouched when there are no dynamic states. */
	if (V_dyn_count == 0)
		return;
	bp = *pbp;

	IPFW_UH_RLOCK_ASSERT(chain);

	/*
	 * Export states of list 'head' in bucket 'b' while a whole
	 * ipfw_dyn_rule still fits before 'ep'. The 'break' only leaves
	 * the current list; later lists repeat the same bound check.
	 */
#define	DYN_EXPORT_STATES(s, af, head, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## head[b], entry) {		\
		if (bp + sizeof(*p) > ep)				\
			break;						\
		p = (ipfw_dyn_rule *)bp;				\
		dyn_export_ ## af ## _state(s, p);			\
		last = p;						\
		bp += sizeof(*p);					\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	if (last != NULL) /* mark last dynamic rule */
		last->next = NULL;
	/* Report back how far the buffer has been filled. */
	*pbp = bp;
#undef DYN_EXPORT_STATES
}

/*
 * Set the default limits and lifetimes, create the UMA zones, allocate
 * the initial 256-bucket hash table and start the dyn_tick() callout.
 * The dyn_hp_cache array is allocated only for the default vnet.
 */
void
ipfw_dyn_init(struct ip_fw_chain *chain)
{

#ifdef IPFIREWALL_JENKINSHASH
	V_dyn_hashseed = arc4random();
#endif
	V_dyn_max = 16384;		/* max # of states */
	V_dyn_parent_max = 4096;	/* max # of parent states */
	V_dyn_buckets_max = 8192;	/* must be power of 2 */

	V_dyn_ack_lifetime = 300;
	V_dyn_syn_lifetime = 20;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	V_dyn_keepalive_interval = 20;
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive = 1;		/* send keepalives */
	V_dyn_keepalive_last = time_uptime;

	/* The data and parent zones are capped at the configured maximums. */
	V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
	    sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);

	V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
	    sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);

	SLIST_INIT(&V_dyn_expired_ipv4);
	V_dyn_ipv4 = NULL;
	V_dyn_ipv4_parent = NULL;
	V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
	    sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

#ifdef INET6
	SLIST_INIT(&V_dyn_expired_ipv6);
	V_dyn_ipv6 = NULL;
	V_dyn_ipv6_parent = NULL;
	V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
	    sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
#endif

	/* Initialize buckets. */
	V_curr_dyn_buckets = 0;
	V_dyn_bucket_lock = NULL;
	dyn_grow_hashtable(chain, 256);

	if (IS_DEFAULT_VNET(curvnet))
		dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW,
		    M_WAITOK | M_ZERO);

	DYN_EXPIRED_LOCK_INIT();
	callout_init(&V_dyn_timeout, 1);
	/* First maintenance run is scheduled hz ticks from now. */
	callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet);
	IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
}

/*
 * Two-pass teardown. Pass 0 only drains the maintenance callout; any
 * other pass frees every state still linked in the hash buckets or in
 * the expired lists, then destroys the zones and frees the bucket arrays.
 */
void
ipfw_dyn_uninit(int pass)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	int bucket;

	if (pass == 0) {
		callout_drain(&V_dyn_timeout);
		return;
	}
	IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
	DYN_EXPIRED_LOCK_DESTROY();

	/*
	 * Pop and free every entry of list V_dyn_<name>. 'CK' is either
	 * CK_ (hash bucket lists) or empty (expired lists) and selects the
	 * matching SLIST macro family; 'en' is the link field name.
	 */
#define	DYN_FREE_STATES_FORCED(CK, s, af, name, en)	do {		\
	while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) {	\
		CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en);	\
		if (s->type == O_LIMIT_PARENT)				\
			uma_zfree(V_dyn_parent_zone, s->limit);		\
		else							\
			uma_zfree(V_dyn_data_zone, s->data);		\
		uma_zfree(V_dyn_ ## af ## _zone, s);			\
	}								\
} while (0)
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket);

		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket],
		    entry);
#ifdef INET6
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket],
		    entry);
#endif /* INET6 */
	}
	/* States already unlinked but not yet freed by dyn_free_states(). */
	DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired);
#ifdef INET6
	DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired);
#endif
#undef DYN_FREE_STATES_FORCED

	uma_zdestroy(V_dyn_ipv4_zone);
	uma_zdestroy(V_dyn_data_zone);
	uma_zdestroy(V_dyn_parent_zone);
#ifdef INET6
	uma_zdestroy(V_dyn_ipv6_zone);
	free(V_dyn_ipv6, M_IPFW);
	free(V_dyn_ipv6_parent, M_IPFW);
	free(V_dyn_ipv6_add, M_IPFW);
	free(V_dyn_ipv6_parent_add, M_IPFW);
	free(V_dyn_ipv6_del, M_IPFW);
	free(V_dyn_ipv6_parent_del, M_IPFW);
#endif
	free(V_dyn_bucket_lock, M_IPFW);
	free(V_dyn_ipv4, M_IPFW);
	free(V_dyn_ipv4_parent, M_IPFW);
	free(V_dyn_ipv4_add, M_IPFW);
	free(V_dyn_ipv4_parent_add, M_IPFW);
	free(V_dyn_ipv4_del, M_IPFW);
	free(V_dyn_ipv4_parent_del, M_IPFW);
	/* dyn_hp_cache is shared and owned by the default vnet. */
	if (IS_DEFAULT_VNET(curvnet))
		free(dyn_hp_cache, M_IPFW);
}
Index: head/sys/netpfil/ipfw/ip_fw_log.c
===================================================================
--- head/sys/netpfil/ipfw/ip_fw_log.c (revision 345161)
+++ head/sys/netpfil/ipfw/ip_fw_log.c (revision 345162)
@@ -1,426 +1,421 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Logging support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include /* ip6_sprintf() */ #endif #include #ifdef MAC #include #endif /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) #ifdef __APPLE__ #undef snprintf #define snprintf sprintf #define SNPARGS(buf, len) buf + len #define SNP(buf) buf #else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) #endif /* !__APPLE__ */ #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! 
*/ void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, - struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + struct ip_fw_args *args, struct mbuf *m, u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[92], proto[128], fragment[32]; if (V_fw_verbose == 0) { if (args->flags & IPFW_ARGS_ETHER) /* layer2, use orig hdr */ ipfw_bpf_mtap2(args->eh, ETHER_HDR_LEN, m); else { /* Add fake header. Later we will store * more info in the header. */ if (ip->ip_v == 4) ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); else if (ip->ip_v == 6) ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m); else /* Obviously bogus EtherType. */ ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m); } return; } /* the old 'log' function */ fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; V_norule_counter++; if (V_norule_counter == V_verbose_limit) limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB || cmd->opcode == O_TAG || cmd->opcode == O_SETDSCP) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_REJECT_ABORT) action = "Abort"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = 
"Reset"; else if (cmd->arg1==ICMP6_UNREACH_ABORT) action = "Abort"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", TARG(cmd->arg1, divert)); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", TARG(cmd->arg1, divert)); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", TARG(cmd->arg1, fib) & 0x7FFF); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", TARG(cmd->arg1, skipto)); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", TARG(cmd->arg1, pipe)); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", TARG(cmd->arg1, pipe)); break; case O_FORWARD_IP: { char buf[INET_ADDRSTRLEN]; ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa_r(dummyaddr, buf)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; #ifdef INET6 case O_FORWARD_IP6: { char buf[INET6_ADDRSTRLEN]; ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", ip6_sprintf(buf, &sa->sa.sin6_addr)); if (sa->sa.sin6_port) snprintf(SNPARGS(action2, len), ":%u", sa->sa.sin6_port); } break; #endif case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; case O_REASS: action = "Reass"; break; case O_CALLRETURN: if (cmd->len & F_NOT) action = "Return"; else snprintf(SNPARGS(action2, 0), "Call %d", cmd->arg1); break; case O_EXTERNAL_ACTION: snprintf(SNPARGS(action2, 0), "Eaction %s", ((struct named_object *)SRV_OBJECT(chain, cmd->arg1))->name); break; default: action = "UNKNOWN"; 
break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; u_short ip6f_mf; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 ip6f_mf = offset & IP6F_MORE_FRAG; offset &= IP6F_OFF_MASK; if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: case IPPROTO_UDPLITE: len = snprintf(SNPARGS(proto, 0), "UDP%s%s", args->f_id.proto == IPPROTO_UDP ? 
" ": "Lite ", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset || ip6f_mf) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, ntohs(ip6->ip6_plen) - hlen, ntohs(offset) << 3, ip6f_mf ? "+" : ""); } else #endif { int ipoff, iplen; ipoff = ntohs(ip->ip_off); iplen = ntohs(ip->ip_len); if (ipoff & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), offset << 3, (ipoff & IP_MF) ? "+" : ""); } } #ifdef __FreeBSD__ - if (oif || m->m_pkthdr.rcvif) - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s %s via %s%s\n", - f ? f->rulenum : -1, - action, proto, oif ? "out" : "in", - oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, - fragment); - else + log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", + f ? f->rulenum : -1, action, proto, + args->flags & IPFW_ARGS_OUT ? "out" : "in", args->ifp->if_xname, + fragment); +#else + log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", + f ? 
f->rulenum : -1, action, proto, fragment); #endif - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s [no if info]%s\n", - f ? f->rulenum : -1, - action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* end of file */ Index: head/sys/netpfil/ipfw/ip_fw_nat.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_nat.c (revision 345161) +++ head/sys/netpfil/ipfw/ip_fw_nat.c (revision 345162) @@ -1,1242 +1,1243 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ /* Server pool (LSNAT) entry hanging off a redirect. */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; /* server ip address */ uint16_t port; /* server port; htons() is applied when it is handed to libalias */ }; /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ uint16_t mode; /* type of redirect mode */ uint16_t proto; /* protocol: tcp/udp */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ struct alias_link **alink; /* libalias links; array allocated with pport_cnt slots */ u_int16_t spool_cnt; /* number of entries in spool chain */ /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ struct libalias *lib; /* libalias instance */ int mode; /* aliasing mode */ int redir_cnt; /* number of entries in redir chain */ /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; char if_name[IF_NAMESIZE]; /* interface name */ }; static eventhandler_tag ifaddr_event_tag; /* * For every nat instance whose if_name matches ifp, reload the alias * address from the AF_INET addresses configured on ifp. Does nothing * until both V_ipfw_vnet_ready and V_ipfw_nat_ready are set. */ static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0) return; chain = &V_layer3_chain; IPFW_UH_WLOCK(chain); /* Check every nat entry... 
*/ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; /* Every AF_INET address is applied in turn, so the last one on the list wins. */ IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); IPFW_WUNLOCK(chain); } if_addr_runlock(ifp); } IPFW_UH_WUNLOCK(chain); } /* * Clear the cached nat instance pointer of every rule whose action is * O_NAT and which points at nat entry ix, or of all such rules if ix < 0. * The chain write lock must be held (asserted below). */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { int i; ipfw_insn_nat *cmd; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); /* XXX skip log and the like ? */ if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } /* * Tear down every redirect on 'head': delete its libalias links from * n->lib, free its spool entries and its alink array, then unlink and * free the redirect itself. A redirect with an unknown mode is only * reported; it is neither unlinked nor freed. */ static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case NAT44_REDIR_ADDR: case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? 
*/ break; } } } /* * Build the redirect chain of 'ptr' from the serialized records in 'buf': * ptr->redir_cnt nat44_cfg_redir records, each followed by its spool_cnt * nat44_cfg_spool records. Each redirect is registered with libalias * according to its mode before it is linked in. * * Returns 0 on success, or EINVAL as soon as a redirect yields no libalias * link (unknown mode included); redirects linked before that point stay on * the chain. * * NOTE(review): nothing here bounds 'off' against the size of 'buf'; * confirm the callers validate the whole layout. * NOTE(review): alink is allocated with pport_cnt slots but alink[0] is * always accessed; confirm pport_cnt == 0 cannot reach this function. */ static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct nat44_cfg_redir *)&buf[off]; r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); r->mode = ser_r->mode; r->laddr = ser_r->laddr; r->paddr = ser_r->paddr; r->raddr = ser_r->raddr; r->lport = ser_r->lport; r->pport = ser_r->pport; r->rport = ser_r->rport; r->pport_cnt = ser_r->pport_cnt; r->rport_cnt = ser_r->rport_cnt; r->proto = ser_r->proto; r->spool_cnt = ser_r->spool_cnt; //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); free(r->alink, M_IPFW); free(r, M_IPFW); return (EINVAL); } /* LSNAT handling. 
*/ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct nat44_cfg_spool *)&buf[off]; s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); s->addr = ser_s->addr; s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (0); } static void free_nat_instance(struct cfg_nat *ptr) { del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } /* * ipfw_nat - perform mbuf header translation. * * Note V_layer3_chain has to be locked while calling ipfw_nat() in * 'global' operation mode (t == NULL). * */ static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval, found; struct ip_fw_chain *chain; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. 
* To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); /* Check if this is 'global' instance */ if (t == NULL) { - if (args->oif == NULL) { + if (args->flags & IPFW_ARGS_IN) { /* Wrong direction, skip processing */ args->m = mcl; return (IP_FW_NAT); } found = 0; chain = &V_layer3_chain; IPFW_RLOCK_ASSERT(chain); /* Check every nat entry... */ LIST_FOREACH(t, &chain->nat, _next) { if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) continue; retval = LibAliasOutTry(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl), 0); if (retval == PKT_ALIAS_OK) { /* Nat instance recognises state */ found = 1; break; } } if (found != 1) { /* No instance found, return ignore */ args->m = mcl; return (IP_FW_NAT); } } else { - if (args->oif == NULL) + if (args->flags & IPFW_ARGS_IN) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); } /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. 
*/ if (retval == PKT_ALIAS_ERROR || - (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || + ((args->flags & IPFW_ARGS_IN) && + (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } if (retval == PKT_ALIAS_RESPOND) mcl->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; uint16_t ip_len, cksum; ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static struct cfg_nat * lookup_nat_name(struct nat_list *l, char *name) { struct cfg_nat *res; int id; char *errptr; id = strtol(name, &errptr, 10); if (id == 0 || *errptr != '\0') return (NULL); LIST_FOREACH(res, l, _next) { if (res->id == id) break; } return (res); } /* IP_FW3 configuration routines */ static void nat44_config(struct ip_fw_chain *chain, struct 
nat44_cfg_nat *ucfg)
{
	/*
	 * Create the nat instance named by ucfg->name, or reconfigure the
	 * existing one, from the user-supplied configuration.  The redir and
	 * spool records are read from the memory that follows *ucfg.
	 *
	 * The instance is (re)configured while it is unhooked from chain->nat
	 * and the UH lock is dropped, then linked back in.  chain->gencnt is
	 * sampled before the lock is dropped so that a concurrent
	 * configuration can be detected once the lock is taken again.
	 */
	struct cfg_nat *ptr, *tcfg;
	int gencnt;

	/*
	 * Find/create nat rule.
	 */
	IPFW_UH_WLOCK(chain);
	gencnt = chain->gencnt;
	ptr = lookup_nat_name(&chain->nat, ucfg->name);
	if (ptr == NULL) {
		IPFW_UH_WUNLOCK(chain);
		/* New rule: allocate and init new instance. */
		ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO);
		ptr->lib = LibAliasInit(NULL);
		LIST_INIT(&ptr->redir_chain);
	} else {
		/* Entry already present: temporarily unhook it. */
		IPFW_WLOCK(chain);
		LIST_REMOVE(ptr, _next);
		flush_nat_ptrs(chain, ptr->id);
		IPFW_WUNLOCK(chain);
		IPFW_UH_WUNLOCK(chain);
	}

	/*
	 * Basic nat (re)configuration.
	 */
	ptr->id = strtol(ucfg->name, NULL, 10);
	/*
	 * XXX - what if this rule doesn't nat any ip and just
	 * redirect?
	 * do we set aliasaddress to 0.0.0.0?
	 */
	ptr->ip = ucfg->ip;
	ptr->redir_cnt = ucfg->redir_cnt;
	ptr->mode = ucfg->mode;
	strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name));
	LibAliasSetMode(ptr->lib, ptr->mode, ~0);
	LibAliasSetAddress(ptr->lib, ptr->ip);

	/*
	 * Redir and LSNAT configuration.
	 */
	/* Delete old cfgs. */
	del_redir_spool_cfg(ptr, &ptr->redir_chain);
	/*
	 * Add new entries.
	 * NOTE(review): a failure (EINVAL) from add_redir_spool_cfg() is not
	 * propagated because this function returns void.
	 */
	add_redir_spool_cfg((char *)(ucfg + 1), ptr);
	IPFW_UH_WLOCK(chain);

	/* Extra check to avoid race with another ipfw_nat_cfg() */
	tcfg = NULL;
	if (gencnt != chain->gencnt)
		tcfg = lookup_nat_name(&chain->nat, ucfg->name);
	IPFW_WLOCK(chain);
	if (tcfg != NULL) {
		/*
		 * Somebody linked an instance with the same name while the
		 * lock was dropped: ours replaces it.  Drop the pointers that
		 * rules may have cached to it, as every other unlink does.
		 */
		LIST_REMOVE(tcfg, _next);
		flush_nat_ptrs(chain, tcfg->id);
	}
	LIST_INSERT_HEAD(&chain->nat, ptr, _next);
	IPFW_WUNLOCK(chain);
	chain->gencnt++;

	IPFW_UH_WUNLOCK(chain);

	/*
	 * Free the instance that was displaced.  'ptr' is now linked into
	 * chain->nat and must stay alive; freeing it here would leave a
	 * dangling list entry and leak 'tcfg'.
	 */
	if (tcfg != NULL)
		free_nat_instance(tcfg);
}

/*
 * Creates/configure nat44 instance
 * Data layout (v0)(current):
 * Request: [ ipfw_obj_header nat44_cfg_nat ..
] * * Returns 0 on success, EINVAL if the request is malformed */ static int nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; int id; size_t read; char *errptr; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated and looks like number */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); id = strtol(ucfg->name, &errptr, 10); if (id == 0 || *errptr != '\0') return (EINVAL); read = sizeof(*oh) + sizeof(*ucfg); /* * Check that all redir records fit in the request. * NOTE(review): the spool records that follow each redir are not * accounted for here; confirm they are validated elsewhere. */ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) return (EINVAL); nat44_config(chain, ucfg); return (0); } /* * Destroys given nat instances. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success, EINVAL if the request is malformed, * ESRCH if no instance has the given name */ static int nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct cfg_nat *ptr; ipfw_obj_ntlv *ntlv; /* Check minimum header size */ if (sd->valsize < sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ntlv = &oh->ntlv; /* Check if name is properly terminated */ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name)) return (EINVAL); IPFW_UH_WLOCK(chain); ptr = lookup_nat_name(&chain->nat, ntlv->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (ESRCH); } /* Unlink the instance and drop the pointers rules cached to it before freeing it. */ IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } /* * Fill the user-visible config header 'ucfg' from instance 'ptr'; * the numeric id is exported as a decimal string in ucfg->name. */ static void export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg) { snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id); ucfg->ip = 
ptr->ip; ucfg->redir_cnt = ptr->redir_cnt; ucfg->mode = ptr->mode; strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name)); } /* * Gets config for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success, EINVAL if the request is malformed, * ESRCH if no instance has the given name, ENOMEM if the supplied * buffer is too small (ucfg->size then holds the required size) */ static int nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } export_nat_cfg(ptr, ucfg); /* * Compute the reply size: the two headers plus one record per * redirect and one per spool entry. */ sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat); LIST_FOREACH(r, &ptr->redir_chain, _next) { sz += sizeof(struct nat44_cfg_redir); LIST_FOREACH(s, &r->spool_chain, _next) sz += sizeof(struct nat44_cfg_spool); } ucfg->size = sz; if (sd->valsize < sz) { /* * Submitted buffer size is not enough. * We've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. 
*/ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* * Size OK, let's copy data: one nat44_cfg_redir record per redirect, * each immediately followed by its nat44_cfg_spool records. */ LIST_FOREACH(r, &ptr->redir_chain, _next) { ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd, sizeof(*ser_r)); ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space( sd, sizeof(*ser_s)); ser_s->addr = s->addr; ser_s->port = s->port; } } IPFW_UH_RUNLOCK(chain); return (0); } /* * Lists all nat44 instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ] * * Returns 0 on success, EINVAL if the buffer cannot hold the list * header, ENOMEM if it cannot hold all entries (olh->size then holds * the required size) */ static int nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; int nat_count; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(chain); /* First pass: count the instances so the reply size can be reported. */ nat_count = 0; LIST_FOREACH(ptr, &chain->nat, _next) nat_count++; olh->count = nat_count; olh->objsize = sizeof(struct nat44_cfg_nat); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Second pass: export one nat44_cfg_nat per instance. */ LIST_FOREACH(ptr, &chain->nat, _next) { ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd, sizeof(*ucfg)); export_nat_cfg(ptr, ucfg); } IPFW_UH_RUNLOCK(chain); return (0); } /* * Gets log for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat ] * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ] * * Returns 0 on success, EINVAL if the request is malformed, * ESRCH if no instance has the given name, ENOENT if the instance * has no log buffer, ENOMEM if the supplied buffer is too small */ static int nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) 
{ ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; void *pbuf; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } /* No log buffer attached to this libalias instance: nothing to export. */ if (ptr->lib->logDesc == NULL) { IPFW_UH_RUNLOCK(chain); return (ENOENT); } export_nat_cfg(ptr, ucfg); /* * Report the size of the config record plus the log buffer. * NOTE(review): the check below compares against sz + sizeof(*oh) * rather than ucfg->size, although LIBALIAS_BUF_SIZE bytes are * requested afterwards; confirm this bound is the intended one. */ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE; if (sd->valsize < sz + sizeof(*oh)) { /* * Submitted buffer size is not enough. * We've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. 
*/ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Append the raw libalias log buffer after the config record. */ pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); IPFW_UH_RUNLOCK(chain); return (0); } /* Table binding each nat44 sockopt opcode (version 0) to its direction and handler. */ static struct ipfw_sopt_handler scodes[] = { { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, }; /* * Legacy configuration routines. * The *_legacy structures below are the record layouts exchanged with * userland by the legacy handlers ipfw_nat_cfg() and ipfw_nat_get_cfg(). */ struct cfg_spool_legacy { LIST_ENTRY(cfg_spool_legacy) _next; struct in_addr addr; u_short port; }; struct cfg_redir_legacy { LIST_ENTRY(cfg_redir) _next; u_int16_t mode; struct in_addr laddr; struct in_addr paddr; struct in_addr raddr; u_short lport; u_short pport; u_short rport; u_short pport_cnt; u_short rport_cnt; int proto; struct alias_link **alink; u_int16_t spool_cnt; LIST_HEAD(, cfg_spool_legacy) spool_chain; }; struct cfg_nat_legacy { LIST_ENTRY(cfg_nat_legacy) _next; int id; struct in_addr ip; char if_name[IF_NAMESIZE]; int mode; struct libalias *lib; int redir_cnt; LIST_HEAD(, cfg_redir_legacy) redir_chain; }; /* * Legacy configure request: copy in a cfg_nat_legacy record followed by * its redir records, convert them to the nat44_cfg_* layout in the second * half of the same allocation and hand the result to nat44_config(). * Returns 0 on success or the error from the copy-in / validation. */ static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat_legacy *cfg; struct nat44_cfg_nat *ucfg; struct cfg_redir_legacy *rdir; struct nat44_cfg_redir *urdir; char *buf; size_t len, len2; int error, i; len = sopt->sopt_valsize; len2 = len + 128; /* * Allocate 2x buffer to store converted structures. * new redir_cfg has shrunk, so we're sure that * new buffer size is enough. 
*/ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); if (error != 0) goto out; cfg = (struct cfg_nat_legacy *)buf; if (cfg->id < 0) { error = EINVAL; goto out; } ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); ucfg->ip = cfg->ip; ucfg->mode = cfg->mode; ucfg->redir_cnt = cfg->redir_cnt; if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { error = EINVAL; goto out; } urdir = (struct nat44_cfg_redir *)(ucfg + 1); rdir = (struct cfg_redir_legacy *)(cfg + 1); for (i = 0; i < cfg->redir_cnt; i++) { urdir->mode = rdir->mode; urdir->laddr = rdir->laddr; urdir->paddr = rdir->paddr; urdir->raddr = rdir->raddr; urdir->lport = rdir->lport; urdir->pport = rdir->pport; urdir->rport = rdir->rport; urdir->pport_cnt = rdir->pport_cnt; urdir->rport_cnt = rdir->rport_cnt; urdir->proto = rdir->proto; urdir->spool_cnt = rdir->spool_cnt; urdir++; rdir++; } nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); return (error); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (EINVAL); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = &V_layer3_chain; struct cfg_nat *n; struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; struct cfg_redir_legacy *ser_r; struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* Estimate 
memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) len += sizeof(struct cfg_spool_legacy); } } IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { ucfg = (struct cfg_nat_legacy *)&data[len]; ucfg->id = n->id; ucfg->ip = n->ip; ucfg->redir_cnt = n->redir_cnt; ucfg->mode = n->mode; strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { ser_r = (struct cfg_redir_legacy *)&data[len]; ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct cfg_spool_legacy *)&data[len]; ser_s->addr = s->addr; ser_s->port = s->port; len += sizeof(struct cfg_spool_legacy); } } } IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); return (error); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i 
= 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static int vnet_ipfw_nat_init(const void *arg __unused) { V_ipfw_nat_ready = 1; return (0); } static int vnet_ipfw_nat_uninit(const void *arg __unused) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); V_ipfw_nat_ready = 0; LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); IPFW_WUNLOCK(chain); return (0); } static void ipfw_nat_init(void) { /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; /* Define startup order. 
*/ #define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_init, NULL); VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_init, NULL); SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL); /* end of file */ Index: head/sys/netpfil/ipfw/ip_fw_pfil.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_pfil.c (revision 345161) +++ head/sys/netpfil/ipfw/ip_fw_pfil.c (revision 345162) @@ -1,667 +1,667 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include VNET_DEFINE_STATIC(int, fw_enable) = 1; #define V_fw_enable VNET(fw_enable) #ifdef INET6 VNET_DEFINE_STATIC(int, fw6_enable) = 1; #define V_fw6_enable VNET(fw6_enable) #endif VNET_DEFINE_STATIC(int, fwlink_enable) = 0; #define V_fwlink_enable VNET(fwlink_enable) int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Forward declarations. 
*/ static int ipfw_divert(struct mbuf **, bool, struct ipfw_rule_ref *, int); #ifdef SYSCTL_NODE SYSBEGIN(f1) SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6_fw); SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ SYSCTL_DECL(_net_link_ether); SYSCTL_PROC(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0, ipfw_chg_hook, "I", "Pass ether pkts through firewall"); SYSEND #endif /* SYSCTL_NODE */ /* * The pfilter hook to pass packets to ipfw_chk and then to * dummynet, divert, netgraph or other modules. * The packet may be consumed. */ static pfil_return_t -ipfw_check_packet(struct mbuf **m0, struct ifnet *ifp, int dir, +ipfw_check_packet(struct mbuf **m0, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct ip_fw_args args; struct m_tag *tag; pfil_return_t ret; - int ipfw; + int ipfw, dir; - /* convert dir to IPFW values */ - dir = (dir & PFIL_IN) ? DIR_IN : DIR_OUT; - args.flags = 0; + args.flags = (flags & PFIL_IN) ? IPFW_ARGS_IN : IPFW_ARGS_OUT; + dir = (flags & PFIL_IN) ? DIR_IN : DIR_OUT; again: /* * extract and remove the tag if present. If we are left * with onepass, optimize the outgoing path. */ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (tag != NULL) { args.rule = *((struct ipfw_rule_ref *)(tag+1)); m_tag_delete(*m0, tag); if (args.rule.info & IPFW_ONEPASS) return (0); args.flags |= IPFW_ARGS_REF; } args.m = *m0; - args.oif = dir == DIR_OUT ? 
ifp : NULL; + args.ifp = ifp; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); ret = PFIL_PASS; switch (ipfw) { case IP_FW_PASS: /* next_hop may be set by ipfw_chk */ if ((args.flags & (IPFW_ARGS_NH4 | IPFW_ARGS_NH4PTR | IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)) == 0) break; #if (!defined(INET6) && !defined(INET)) ret = PFIL_DROPPED; #else { void *psa; size_t len; #ifdef INET if (args.flags & (IPFW_ARGS_NH4 | IPFW_ARGS_NH4PTR)) { MPASS((args.flags & (IPFW_ARGS_NH4 | IPFW_ARGS_NH4PTR)) != (IPFW_ARGS_NH4 | IPFW_ARGS_NH4PTR)); MPASS((args.flags & (IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)) == 0); len = sizeof(struct sockaddr_in); psa = (args.flags & IPFW_ARGS_NH4) ? &args.hopstore : args.next_hop; if (in_localip(satosin(psa)->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; (*m0)->m_flags |= M_IP_NEXTHOP; } #endif /* INET */ #ifdef INET6 if (args.flags & (IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)) { MPASS((args.flags & (IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)) != (IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)); MPASS((args.flags & (IPFW_ARGS_NH4 | IPFW_ARGS_NH4PTR)) == 0); len = sizeof(struct sockaddr_in6); psa = args.next_hop6; (*m0)->m_flags |= M_IP6_NEXTHOP; } #endif /* INET6 */ /* * Incoming packets should not be tagged so we do not * m_tag_find. Outgoing packets may be tagged, so we * reuse the tag if present. */ - tag = (dir == DIR_IN) ? NULL : + tag = (flags & PFIL_IN) ? 
NULL : m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); if (tag != NULL) { m_tag_unlink(*m0, tag); } else { tag = m_tag_get(PACKET_TAG_IPFORWARD, len, M_NOWAIT); if (tag == NULL) { ret = PFIL_DROPPED; break; } } if ((args.flags & IPFW_ARGS_NH6) == 0) bcopy(psa, tag + 1, len); m_tag_prepend(*m0, tag); ret = 0; #ifdef INET6 /* IPv6 next hop needs additional handling */ if (args.flags & (IPFW_ARGS_NH6 | IPFW_ARGS_NH6PTR)) { struct sockaddr_in6 *sa6; sa6 = satosin6(tag + 1); if (args.flags & IPFW_ARGS_NH6) { sa6->sin6_family = AF_INET6; sa6->sin6_len = sizeof(*sa6); sa6->sin6_addr = args.hopstore6.sin6_addr; sa6->sin6_port = args.hopstore6.sin6_port; sa6->sin6_scope_id = args.hopstore6.sin6_scope_id; } /* * If nh6 address is link-local we should convert * it to kernel internal form before doing any * comparisons. */ if (sa6_embedscope(sa6, V_ip6_use_defzone) != 0) { ret = PFIL_DROPPED; break; } if (in6_localip(&sa6->sin6_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; } #endif /* INET6 */ } #endif /* INET || INET6 */ break; case IP_FW_DENY: ret = PFIL_DROPPED; break; case IP_FW_DUMMYNET: if (ip_dn_io_ptr == NULL) { ret = PFIL_DROPPED; break; } MPASS(args.flags & IPFW_ARGS_REF); if (mtod(*m0, struct ip *)->ip_v == 4) (void )ip_dn_io_ptr(m0, dir, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) (void )ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); else { ret = PFIL_DROPPED; break; } /* * XXX should read the return value. * dummynet normally eats the packet and sets *m0=NULL * unless the packet can be sent immediately. In this * case args is updated and we should re-run the * check without clearing args. */ if (*m0 != NULL) goto again; ret = PFIL_CONSUMED; break; case IP_FW_TEE: case IP_FW_DIVERT: if (ip_divert_ptr == NULL) { ret = PFIL_DROPPED; break; } MPASS(args.flags & IPFW_ARGS_REF); (void )ipfw_divert(m0, dir == DIR_IN, &args.rule, (ipfw == IP_FW_TEE) ? 1 : 0); /* continue processing for the original packet (tee). 
*/ if (*m0) goto again; ret = PFIL_CONSUMED; break; case IP_FW_NGTEE: case IP_FW_NETGRAPH: if (ng_ipfw_input_p == NULL) { ret = PFIL_DROPPED; break; } MPASS(args.flags & IPFW_ARGS_REF); (void )ng_ipfw_input_p(m0, dir, &args, (ipfw == IP_FW_NGTEE) ? 1 : 0); if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ goto again; /* continue with packet */ ret = PFIL_CONSUMED; break; case IP_FW_NAT: /* honor one-pass in case of successful nat */ if (V_fw_one_pass) break; goto again; case IP_FW_REASS: goto again; /* continue with packet */ case IP_FW_NAT64: ret = PFIL_CONSUMED; break; default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != PFIL_PASS) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } return (ret); } /* * ipfw processing for ethernet packets (in and out). */ static pfil_return_t ipfw_check_frame(struct mbuf **m0, struct ifnet *ifp, int dir, void *ruleset __unused, struct inpcb *inp) { struct ip_fw_args args; struct ether_header save_eh; struct ether_header *eh; struct m_tag *mtag; struct mbuf *m; pfil_return_t ret; int i; args.flags = IPFW_ARGS_ETHER; + args.flags |= (dir & PFIL_IN) ? IPFW_ARGS_IN : IPFW_ARGS_OUT; again: /* fetch start point from rule, if any. remove the tag if present. */ mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (mtag != NULL) { args.rule = *((struct ipfw_rule_ref *)(mtag+1)); m_tag_delete(*m0, mtag); if (args.rule.info & IPFW_ONEPASS) return (0); args.flags |= IPFW_ARGS_REF; } /* I need some amt of data to be contiguous */ m = *m0; i = min(m->m_pkthdr.len, max_protohdr); if (m->m_len < i) { m = m_pullup(m, i); if (m == NULL) { *m0 = m; return (0); } } eh = mtod(m, struct ether_header *); save_eh = *eh; /* save copy for restore below */ m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ args.m = m; /* the packet we are looking at */ - args.oif = dir & PFIL_OUT ? 
ifp: NULL; /* destination, if any */ + args.ifp = ifp; args.eh = &save_eh; /* MAC header for bridged/MAC packets */ args.inp = inp; /* used by ipfw uid/gid/jail rules */ i = ipfw_chk(&args); m = args.m; if (m != NULL) { /* * Restore Ethernet header, as needed, in case the * mbuf chain was replaced by ipfw. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) { *m0 = NULL; return (0); } if (eh != mtod(m, struct ether_header *)) bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } *m0 = m; ret = PFIL_PASS; /* Check result of ipfw_chk() */ switch (i) { case IP_FW_PASS: break; case IP_FW_DENY: ret = PFIL_DROPPED; break; case IP_FW_DUMMYNET: if (ip_dn_io_ptr == NULL) { ret = PFIL_DROPPED; break; } *m0 = NULL; dir = (dir & PFIL_IN) ? DIR_IN : DIR_OUT; MPASS(args.flags & IPFW_ARGS_REF); ip_dn_io_ptr(&m, dir | PROTO_LAYER2, &args); return (PFIL_CONSUMED); case IP_FW_NGTEE: case IP_FW_NETGRAPH: if (ng_ipfw_input_p == NULL) { ret = PFIL_DROPPED; break; } MPASS(args.flags & IPFW_ARGS_REF); (void )ng_ipfw_input_p(m0, (dir & PFIL_IN) ? DIR_IN : DIR_OUT, &args, (i == IP_FW_NGTEE) ? 1 : 0); if (i == IP_FW_NGTEE) /* ignore errors for NGTEE */ goto again; /* continue with packet */ ret = PFIL_CONSUMED; break; default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != PFIL_PASS) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } return (ret); } /* do the divert, return 1 on error 0 on success */ static int ipfw_divert(struct mbuf **m0, bool incoming, struct ipfw_rule_ref *rule, int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ struct mbuf *clone; struct ip *ip = mtod(*m0, struct ip *); struct m_tag *tag; /* Cloning needed for tee? 
*/ if (tee == 0) { clone = *m0; /* use the original mbuf */ *m0 = NULL; } else { clone = m_dup(*m0, M_NOWAIT); /* If we cannot duplicate the mbuf, we sacrifice the divert * chain and continue with the tee-ed packet. */ if (clone == NULL) return 1; } /* * Divert listeners can normally handle non-fragmented packets, * but we can only reass in the non-tee case. * This means that listeners on a tee rule may get fragments, * and have to live with that. * Note that we now have the 'reass' ipfw option so if we care * we can do it before a 'tee'. */ if (!tee) switch (ip->ip_v) { case IPVERSION: if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { int hlen; struct mbuf *reass; reass = ip_reass(clone); /* Reassemble packet. */ if (reass == NULL) return 0; /* not an error */ /* if reass = NULL then it was consumed by ip_reass */ /* * IP header checksum fixup after reassembly and leave header * in network byte order. */ ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(reass, hlen); clone = reass; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { int nxt, off; off = sizeof(struct ip6_hdr); nxt = frag6_input(&clone, &off, 0); if (nxt == IPPROTO_DONE) return (0); } break; } #endif } /* attach a tag to the packet with the reinject info */ tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT); if (tag == NULL) { FREE_PKT(clone); return 1; } *((struct ipfw_rule_ref *)(tag+1)) = *rule; m_tag_prepend(clone, tag); /* Do the dirty job... 
*/ ip_divert_ptr(clone, incoming); return 0; } /* * attach or detach hooks for a given protocol family */ VNET_DEFINE_STATIC(pfil_hook_t, ipfw_inet_hook); #define V_ipfw_inet_hook VNET(ipfw_inet_hook) #ifdef INET6 VNET_DEFINE_STATIC(pfil_hook_t, ipfw_inet6_hook); #define V_ipfw_inet6_hook VNET(ipfw_inet6_hook) #endif VNET_DEFINE_STATIC(pfil_hook_t, ipfw_link_hook); #define V_ipfw_link_hook VNET(ipfw_link_hook) static int ipfw_hook(int onoff, int pf) { struct pfil_hook_args pha; struct pfil_link_args pla; pfil_hook_t *h; pha.pa_version = PFIL_VERSION; pha.pa_flags = PFIL_IN | PFIL_OUT; pha.pa_modname = "ipfw"; pha.pa_ruleset = NULL; pla.pa_version = PFIL_VERSION; pla.pa_flags = PFIL_IN | PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR; switch (pf) { case AF_INET: pha.pa_func = ipfw_check_packet; pha.pa_type = PFIL_TYPE_IP4; pha.pa_rulname = "default"; h = &V_ipfw_inet_hook; pla.pa_head = V_inet_pfil_head; break; #ifdef INET6 case AF_INET6: pha.pa_func = ipfw_check_packet; pha.pa_type = PFIL_TYPE_IP6; pha.pa_rulname = "default6"; h = &V_ipfw_inet6_hook; pla.pa_head = V_inet6_pfil_head; break; #endif case AF_LINK: pha.pa_func = ipfw_check_frame; pha.pa_type = PFIL_TYPE_ETHERNET; pha.pa_rulname = "default-link"; h = &V_ipfw_link_hook; pla.pa_head = V_link_pfil_head; break; } if (onoff) { *h = pfil_add_hook(&pha); pla.pa_hook = *h; (void)pfil_link(&pla); } else if (*h != NULL) pfil_remove_hook(*h); return 0; } int ipfw_attach_hooks(int arg) { int error = 0; if (arg == 0) /* detach */ ipfw_hook(0, AF_INET); else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ printf("ipfw_hook() error\n"); } #ifdef INET6 if (arg == 0) /* detach */ ipfw_hook(0, AF_INET6); else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { error = ENOENT; printf("ipfw6_hook() error\n"); } #endif if (arg == 0) /* detach */ ipfw_hook(0, AF_LINK); else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { error = ENOENT; printf("ipfw_link_hook() error\n"); } 
return error; } int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { int newval; int error; int af; if (arg1 == &V_fw_enable) af = AF_INET; #ifdef INET6 else if (arg1 == &V_fw6_enable) af = AF_INET6; #endif else if (arg1 == &V_fwlink_enable) af = AF_LINK; else return (EINVAL); newval = *(int *)arg1; /* Handle sysctl change */ error = sysctl_handle_int(oidp, &newval, 0, req); if (error) return (error); /* Formalize new value */ newval = (newval) ? 1 : 0; if (*(int *)arg1 == newval) return (0); error = ipfw_hook(newval, af); if (error) return (error); *(int *)arg1 = newval; return (0); } /* end of file */ Index: head/sys/netpfil/ipfw/ip_fw_private.h =================================================================== --- head/sys/netpfil/ipfw/ip_fw_private.h (revision 345161) +++ head/sys/netpfil/ipfw/ip_fw_private.h (revision 345162) @@ -1,838 +1,845 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. */ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, IP_FW_NAT64, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. 
*/ struct ip_fw_args { uint32_t flags; -#define IPFW_ARGS_ETHER 0x0001 /* has valid ethernet header */ -#define IPFW_ARGS_NH4 0x0002 /* has IPv4 next hop in hopstore */ -#define IPFW_ARGS_NH6 0x0004 /* has IPv6 next hop in hopstore */ -#define IPFW_ARGS_NH4PTR 0x0008 /* has IPv4 next hop in next_hop */ -#define IPFW_ARGS_NH6PTR 0x0010 /* has IPv6 next hop in next_hop6 */ -#define IPFW_ARGS_REF 0x0020 /* has valid ipfw_rule_ref */ +#define IPFW_ARGS_ETHER 0x00010000 /* valid ethernet header */ +#define IPFW_ARGS_NH4 0x00020000 /* IPv4 next hop in hopstore */ +#define IPFW_ARGS_NH6 0x00040000 /* IPv6 next hop in hopstore */ +#define IPFW_ARGS_NH4PTR 0x00080000 /* IPv4 next hop in next_hop */ +#define IPFW_ARGS_NH6PTR 0x00100000 /* IPv6 next hop in next_hop6 */ +#define IPFW_ARGS_REF 0x00200000 /* valid ipfw_rule_ref */ +#define IPFW_ARGS_IN 0x00400000 /* called on input */ +#define IPFW_ARGS_OUT 0x00800000 /* called on output */ +#define IPFW_ARGS_IP4 0x01000000 /* belongs to v4 ISR */ +#define IPFW_ARGS_IP6 0x02000000 /* belongs to v6 ISR */ +#define IPFW_ARGS_DROP 0x04000000 /* drop it (dummynet) */ +#define IPFW_ARGS_LENMASK 0x0000ffff /* length of data in *mem */ +#define IPFW_ARGS_LENGTH(f) ((f) & IPFW_ARGS_LENMASK) /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ - struct ifnet *oif; /* output interface */ + struct ifnet *ifp; /* input/output interface */ struct inpcb *inp; union { /* * We don't support forwarding on layer2, thus we can * keep eh pointer in this union. * next_hop[6] pointers can be used to point to next hop * stored in rule's opcode to avoid copying into hopstore. * Also, it is expected that all 0x1-0x10 flags are mutually * exclusive. 
*/ struct ether_header *eh; /* for bridged packets */ struct sockaddr_in *next_hop; struct sockaddr_in6 *next_hop6; /* ipfw next hop storage */ struct sockaddr_in hopstore; struct ip_fw_nh6 { struct in6_addr sin6_addr; uint32_t sin6_scope_id; uint16_t sin6_port; } hopstore6; }; struct mbuf *m; /* the mbuf chain */ struct ipfw_flow_id f_id; /* grabbed from IP header */ }; MALLOC_DECLARE(M_IPFW); /* * Hooks sometime need to know the direction of the packet * (divert, dummynet, netgraph, ...) * We use a generic definition here, with bit0-1 indicating the * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the * specific protocol * indicating the protocol (if necessary) */ enum { DIR_MASK = 0x3, DIR_OUT = 0, DIR_IN = 1, DIR_FWD = 2, DIR_DROP = 3, PROTO_LAYER2 = 0x4, /* set for layer 2 */ /* PROTO_DEFAULT = 0, */ PROTO_IPV4 = 0x08, PROTO_IPV6 = 0x10, PROTO_IFB = 0x0c, /* layer2 + ifbridge */ /* PROTO_OLDBDG = 0x14, unused, old bridge */ }; /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function definitions. 
*/ int ipfw_chk(struct ip_fw_args *args); struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); /* attach (arg = 1) or detach (arg = 0) hooks */ int ipfw_attach_hooks(int); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; struct ip_fw_chain; void ipfw_bpf_init(int); void ipfw_bpf_uninit(int); void ipfw_bpf_mtap2(void *, u_int, struct mbuf *); void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, - struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + struct ip_fw_args *args, struct mbuf *m, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ struct sockopt_data; enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * Macro to determine that we need to do or redo dynamic state lookup. * direction == MATCH_UNKNOWN means that this is first lookup, then we need * to do lookup. * Otherwise check the state name, if previous lookup was for "any" name, * this means there is no state with specific name. Thus no need to do * lookup. If previous name was not "any", redo lookup for specific name. 
*/
#define	DYN_LOOKUP_NEEDED(p, cmd) \
	((p)->direction == MATCH_UNKNOWN || \
	((p)->kidx != 0 && (p)->kidx != (cmd)->arg1))
/*
 * Reset a lookup-info record: direction becomes MATCH_UNKNOWN and kidx 0,
 * which makes DYN_LOOKUP_NEEDED() evaluate true for that record.
 */
#define	DYN_INFO_INIT(p) do { \
	(p)->direction = MATCH_UNKNOWN; \
	(p)->kidx = 0; \
} while (0)
/*
 * Lookup bookkeeping handed to ipfw_dyn_install_state() and
 * ipfw_dyn_lookup_state() through their `info' argument.
 * NOTE(review): direction/kidx are presumably the fields the two macros
 * above operate on -- confirm against the callers.
 */
struct ipfw_dyn_info {
	uint16_t	direction;	/* match direction */
	uint16_t	kidx;		/* state name kidx */
	uint32_t	hashval;	/* hash value */
	uint32_t	version;	/* bucket version */
	uint32_t	f_pos;
};
/* Dynamic state entry points (ipfw_dyn_*). */
int ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
    uint32_t tablearg);
struct ip_fw *ipfw_dyn_lookup_state(const struct ip_fw_args *args,
    const void *ulp, int pktlen, const ipfw_insn *cmd,
    struct ipfw_dyn_info *info);

int ipfw_is_dyn_rule(struct ip_fw *rule);
void ipfw_expire_dyn_states(struct ip_fw_chain *, ipfw_range_tlv *);
void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep);
int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd);

void ipfw_dyn_init(struct ip_fw_chain *);	/* per-vnet initialization */
void ipfw_dyn_uninit(int);	/* per-vnet deinitialization */
int ipfw_dyn_len(void);
uint32_t ipfw_dyn_get_count(uint32_t *, int *);
void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id,
    uint16_t default_id, uint16_t instance_id);

/*
 * common variables
 *
 * Every VNET_DECLARE(type, name) below is paired with a V_<name> accessor
 * that expands to VNET(name).
 */
VNET_DECLARE(int, fw_one_pass);
#define	V_fw_one_pass		VNET(fw_one_pass)

VNET_DECLARE(int, fw_verbose);
#define	V_fw_verbose		VNET(fw_verbose)

VNET_DECLARE(struct ip_fw_chain, layer3_chain);
#define	V_layer3_chain		VNET(layer3_chain)

VNET_DECLARE(int, ipfw_vnet_ready);
#define	V_ipfw_vnet_ready	VNET(ipfw_vnet_ready)

VNET_DECLARE(u_int32_t, set_disable);
#define	V_set_disable		VNET(set_disable)

VNET_DECLARE(int, autoinc_step);
#define	V_autoinc_step		VNET(autoinc_step)

VNET_DECLARE(unsigned int, fw_tables_max);
#define	V_fw_tables_max		VNET(fw_tables_max)

VNET_DECLARE(unsigned int, fw_tables_sets);
#define	V_fw_tables_sets	VNET(fw_tables_sets)

/* Opaque here; only referenced by pointer in struct ip_fw_chain. */
struct tables_config;

#ifdef _KERNEL
/*
 * Here we have the structure representing an ipfw rule.
 *
 * It starts with a general area
 * followed by an array of one or more instructions, which the code
 * accesses as an array of 32-bit values.
 *
 * Given a rule pointer  r:
 *
 *  r->cmd		is the start of the first instruction.
 *  ACTION_PTR(r)	is the start of the first action (things to do
 *			once a rule matched).
 */

struct ip_fw {
	uint16_t	act_ofs;	/* offset of action in 32-bit units */
	uint16_t	cmd_len;	/* # of 32-bit words in cmd */
	uint16_t	rulenum;	/* rule number */
	uint8_t		set;		/* rule set (0..31) */
	uint8_t		flags;		/* currently unused */
	counter_u64_t	cntr;		/* Pointer to rule counters */
	uint32_t	timestamp;	/* tv_sec of last match */
	uint32_t	id;		/* rule id */
	uint32_t	cached_id;	/* used by jump_fast */
	uint32_t	cached_pos;	/* used by jump_fast */
	uint32_t	refcnt;		/* number of references */

	struct ip_fw	*next;		/* linked list of deleted rules */
	ipfw_insn	cmd[1];		/* storage for commands */
};

/*
 * Room for two 64-bit values.
 * NOTE(review): presumably the pair addressed as cntr and cntr + 1 by the
 * IPFW_*_RULE_COUNTER() macros below -- confirm at the allocation site.
 */
#define	IPFW_RULE_CNTR_SIZE	(2 * sizeof(uint64_t))

#endif

/*
 * Per-ruleset state.  The two locks (rwmtx, uh_lock) are spinlock_t when
 * __linux__ or _WIN32 is defined, and struct rmlock / struct rwlock
 * otherwise.
 */
struct ip_fw_chain {
	struct ip_fw	**map;		/* array of rule ptrs to ease lookup */
	uint32_t	id;		/* ruleset id */
	int		n_rules;	/* number of static rules */
	void		*tablestate;	/* runtime table info */
	void		*valuestate;	/* runtime table value info */
	int		*idxmap;	/* skipto array of rules */
	void		**srvstate;	/* runtime service mappings */
#if defined( __linux__ ) || defined( _WIN32 )
	spinlock_t rwmtx;
#else
	struct rmlock	rwmtx;
#endif
	int		static_len;	/* total len of static rules (v0) */
	uint32_t	gencnt;		/* NAT generation count */
	LIST_HEAD(nat_list, cfg_nat) nat;	/* list of nat entries */
	struct ip_fw	*default_rule;
	struct tables_config *tblcfg;	/* tables module data */
	void		*ifcfg;		/* interface module data */
	int		*idxmap_back;	/* standby skipto array of rules */
	struct namedobj_instance	*srvmap; /* cfg name->number mappings */
#if defined( __linux__ ) || defined( _WIN32 )
	spinlock_t uh_lock;
#else
	struct rwlock	uh_lock;	/* lock for upper half */
#endif
};

/* 64-byte structure representing multi-field table value */
struct table_value {
	uint32_t	tag;		/* O_TAG/O_TAGGED */
	uint32_t	pipe;		/* O_PIPE/O_QUEUE */
	uint16_t	divert;		/* O_DIVERT/O_TEE */
	uint16_t	skipto;		/* skipto, CALLRET */
	uint32_t	netgraph;	/* O_NETGRAPH/O_NGTEE */
	uint32_t	fib;		/* O_SETFIB */
	uint32_t	nat;		/* O_NAT */
	uint32_t	nh4;
	uint8_t		dscp;
	uint8_t		spare0;
	uint16_t	spare1;
	/* -- 32 bytes -- */
	struct in6_addr	nh6;
	uint32_t	limit;		/* O_LIMIT */
	uint32_t	zoneid;		/* scope zone id for nh6 */
	uint64_t	refcnt;		/* Number of references */
};

/*
 * Header shared by named objects; carries two TAILQ linkages, one for the
 * name hash and one for the value hash.
 */
struct named_object {
	TAILQ_ENTRY(named_object)	nn_next;	/* namehash */
	TAILQ_ENTRY(named_object)	nv_next;	/* valuehash */
	char			*name;	/* object name */
	uint16_t		etlv;	/* Export TLV id */
	uint8_t			subtype;/* object subtype within class */
	uint8_t			set;	/* set object belongs to */
	uint16_t		kidx;	/* object kernel index */
	uint16_t		spare;
	uint32_t		ocnt;	/* object counter for internal use */
	uint32_t		refcnt;	/* number of references */
};
TAILQ_HEAD(namedobjects_head, named_object);

struct sockopt;	/* used by tcp_var.h */
/* Buffer bookkeeping passed to sockopt handlers. */
struct sockopt_data {
	caddr_t		kbuf;		/* allocated buffer */
	size_t		ksize;		/* given buffer size */
	size_t		koff;		/* data already used */
	size_t		kavail;		/* number of bytes available */
	size_t		ktotal;		/* total bytes pushed */
	struct sockopt	*sopt;		/* socket data */
	caddr_t		sopt_val;	/* sopt user buffer */
	size_t		valsize;	/* original data size */
};

struct ipfw_ifc;

/* Callback type stored in struct ipfw_ifc (member `cb'). */
typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata,
    uint16_t ifindex);

/* Tracked interface: a named object plus a list of struct ipfw_ifc consumers. */
struct ipfw_iface {
	struct named_object	no;
	char ifname[64];
	int resolved;
	uint16_t ifindex;
	uint16_t spare;
	uint64_t gencnt;
	TAILQ_HEAD(, ipfw_ifc)	consumers;
};

/* One consumer entry: list linkage, owning iface, callback and its argument. */
struct ipfw_ifc {
	TAILQ_ENTRY(ipfw_ifc)	next;
	struct ipfw_iface	*iface;
	ipfw_ifc_cb		*cb;
	void			*cbdata;
};

/* Macro for working with various counters */
/*
 * Rule counters: add 1 to the counter at (_cntr)->cntr and _bytes to the
 * one at (_cntr)->cntr + 1, then store time_uptime into timestamp unless
 * it already holds that value.
 */
#define	IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \
	counter_u64_add((_cntr)->cntr, 1); \
	counter_u64_add((_cntr)->cntr + 1, _bytes); \
	if ((_cntr)->timestamp != time_uptime) \
		(_cntr)->timestamp = time_uptime; \
	} while (0)

/* Dynamic-state counters are plain pcnt/bcnt members updated in place. */
#define	IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \
	(_cntr)->pcnt++; \
	(_cntr)->bcnt += _bytes; \
	} while (0)

/* Zero both rule counters and clear the timestamp. */
#define	IPFW_ZERO_RULE_COUNTER(_cntr) do { \
	counter_u64_zero((_cntr)->cntr); \
	counter_u64_zero((_cntr)->cntr + 1); \
	(_cntr)->timestamp = 0; \
	} while (0)

#define	IPFW_ZERO_DYN_COUNTER(_cntr) do { \
	(_cntr)->pcnt = 0; \
	(_cntr)->bcnt = 0; \
	} while (0)

/*
 * TARG_VAL treats (ch)->valuestate as an array of struct table_value and
 * selects field f of element k.
 * IP_FW_ARG_TABLEARG yields that field when a == IP_FW_TARG, else a itself.
 * Note that it references an identifier named `tablearg' that is not one of
 * its parameters, so one must be in scope wherever the macro is expanded.
 */
#define	TARG_VAL(ch, k, f)	((struct table_value *)((ch)->valuestate))[k].f
#define	IP_FW_ARG_TABLEARG(ch, a, f) \
	(((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a))
/*
 * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
 * so the variable and the macros must be here.
 */

#if defined( __linux__ ) || defined( _WIN32 )
/* In this branch both rwmtx and uh_lock are driven through the rw_* calls. */
#define	IPFW_LOCK_INIT(_chain) do { \
	rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
	rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
	} while (0)

#define	IPFW_LOCK_DESTROY(_chain) do { \
	rw_destroy(&(_chain)->rwmtx); \
	rw_destroy(&(_chain)->uh_lock); \
	} while (0)

#define	IPFW_RLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_RLOCKED)
#define	IPFW_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_WLOCKED)

/* Expands to nothing in this branch. */
#define	IPFW_RLOCK_TRACKER
#define	IPFW_RLOCK(p)			rw_rlock(&(p)->rwmtx)
#define	IPFW_RUNLOCK(p)			rw_runlock(&(p)->rwmtx)
#define	IPFW_WLOCK(p)			rw_wlock(&(p)->rwmtx)
#define	IPFW_WUNLOCK(p)			rw_wunlock(&(p)->rwmtx)
#define	IPFW_PF_RLOCK(p)		IPFW_RLOCK(p)
#define	IPFW_PF_RUNLOCK(p)		IPFW_RUNLOCK(p)
#else /* FreeBSD */
/* Here rwmtx is initialized via rm_init_flags() with RM_RECURSE. */
#define	IPFW_LOCK_INIT(_chain) do { \
	rm_init_flags(&(_chain)->rwmtx, "IPFW static rules", RM_RECURSE); \
	rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
	} while (0)

#define	IPFW_LOCK_DESTROY(_chain) do { \
	rm_destroy(&(_chain)->rwmtx); \
	rw_destroy(&(_chain)->uh_lock); \
	} while (0)

#define	IPFW_RLOCK_ASSERT(_chain)	rm_assert(&(_chain)->rwmtx, RA_RLOCKED)
#define	IPFW_WLOCK_ASSERT(_chain)	rm_assert(&(_chain)->rwmtx, RA_WLOCKED)
#define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker #define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) #define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) #define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) #define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) #define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) struct obj_idx { uint16_t uidx; /* internal index supplied by userland */ uint16_t kidx; /* kernel object index */ uint16_t off; /* tlv offset from rule end in 4-byte words */ uint8_t spare; uint8_t type; /* object type within its category */ }; struct rule_check_info { uint16_t flags; /* rule-specific check flags */ uint16_t object_opcodes; /* num of opcodes referencing objects */ uint16_t urule_numoff; /* offset of rulenum in bytes */ uint8_t version; /* rule version */ uint8_t spare; ipfw_obj_ctlv *ctlv; /* name TLV containter */ struct ip_fw *krule; /* resulting rule pointer */ caddr_t urule; /* original rule pointer */ struct obj_idx obuf[8]; /* table references storage */ }; /* Legacy interface support */ /* * FreeBSD 8 export rule format */ struct ip_fw_rule0 { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present 
in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; struct ip_fw_bcounter0 { uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ }; /* Kernel rule length */ /* * RULE _K_ SIZE _V_ -> * get kernel size from userland rool version _V_. * RULE _U_ SIZE _V_ -> * get user size version _V_ from kernel rule * RULESIZE _V_ -> * get user size rule length */ /* FreeBSD8 <> current kernel format */ #define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) #define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* FreeBSD11 <> current kernel format */ #define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ (r)->cmd_len * 4 - 4, 8)) #define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* * Tables/Objects index rewriting code */ /* Default and maximum number of ipfw tables/objects. */ #define IPFW_TABLES_MAX 65536 #define IPFW_TABLES_DEFAULT 128 #define IPFW_OBJECTS_MAX 65536 #define IPFW_OBJECTS_DEFAULT 1024 #define CHAIN_TO_SRV(ch) ((ch)->srvmap) #define SRV_OBJECT(ch, idx) ((ch)->srvstate[(idx)]) struct tid_info { uint32_t set; /* table set */ uint16_t uidx; /* table index */ uint8_t type; /* table type */ uint8_t atype; uint8_t spare; int tlen; /* Total TLV size block */ void *tlvs; /* Pointer to first TLV */ }; /* * Classifier callback. Checks if @cmd opcode contains kernel object reference. * If true, returns its index and type. * Returns 0 if match is found, 1 overwise. */ typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); /* * Updater callback. Sets kernel object reference index to @puidx */ typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx); /* * Finder callback. Tries to find named object by name (specified via @ti). * Stores found named object pointer in @pno. 
* If object was not found, NULL is stored. * * Return 0 if input data was valid. */ typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno); /* * Another finder callback. Tries to findex named object by kernel index. * * Returns pointer to named object or NULL. */ typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch, uint16_t kidx); /* * Object creator callback. Tries to create object specified by @ti. * Stores newly-allocated object index in @pkidx. * * Returns 0 on success. */ typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti, uint16_t *pkidx); /* * Object destroy callback. Intended to free resources allocated by * create_object callback. */ typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch, struct named_object *no); /* * Sets handler callback. Handles moving and swaping set of named object. * SWAP_ALL moves all named objects from set `set' to `new_set' and vise versa; * TEST_ALL checks that there aren't any named object with conflicting names; * MOVE_ALL moves all named objects from set `set' to `new_set'; * COUNT_ONE used to count number of references used by object with kidx `set'; * TEST_ONE checks that named object with kidx `set' can be moved to `new_set`; * MOVE_ONE moves named object with kidx `set' to set `new_set'. 
*/ enum ipfw_sets_cmd { SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE }; typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); struct opcode_obj_rewrite { uint32_t opcode; /* Opcode to act upon */ uint32_t etlv; /* Relevant export TLV id */ ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */ ipfw_obj_rw_upd *update; /* update cmd with new value */ ipfw_obj_fname_cb *find_byname; /* Find named object by name */ ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */ ipfw_obj_create_cb *create_object; /* Create named object */ ipfw_obj_destroy_cb *destroy_object;/* Destroy named object */ ipfw_obj_sets_cb *manage_sets; /* Swap or move sets */ }; #define IPFW_ADD_OBJ_REWRITER(f, c) do { \ if ((f) != 0) \ ipfw_add_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_OBJ_REWRITER(l, c) do { \ if ((l) != 0) \ ipfw_del_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) /* In ip_fw_iface.c */ int ipfw_iface_init(void); void ipfw_iface_destroy(void); void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic); void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); /* In ip_fw_sockopt.c */ void ipfw_init_skipto_cache(struct ip_fw_chain *chain); void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_ctl3(struct sockopt *sopt); int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule, int locked); void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule); void ipfw_reap_rules(struct ip_fw *head); void ipfw_init_counters(void); void ipfw_destroy_counters(void); struct ip_fw *ipfw_alloc_rule(struct 
ip_fw_chain *chain, size_t rulesize); void ipfw_free_rule(struct ip_fw *rule); int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx); typedef int (sopt_handler_f)(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); struct ipfw_sopt_handler { uint16_t opcode; uint8_t version; uint8_t dir; sopt_handler_f *handler; uint64_t refcnt; }; #define HDIR_SET 0x01 /* Handler is used to set some data */ #define HDIR_GET 0x02 /* Handler is used to retrieve data */ #define HDIR_BOTH HDIR_GET|HDIR_SET void ipfw_init_sopt_handler(void); void ipfw_destroy_sopt_handler(void); void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); #define IPFW_ADD_SOPT_HANDLER(f, c) do { \ if ((f) != 0) \ ipfw_add_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_SOPT_HANDLER(l, c) do { \ if ((l) != 0) \ ipfw_del_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) struct namedobj_instance; typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, void *arg); typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key, uint32_t kopt); typedef int (objhash_cmp_f)(struct named_object *no, const void *key, uint32_t kopt); struct namedobj_instance *ipfw_objhash_create(uint32_t items); void ipfw_objhash_destroy(struct namedobj_instance *); void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_free(void *idx, int blocks); void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); 
struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name); struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name); struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b); void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); uint32_t ipfw_objhash_count(struct namedobj_instance *ni); uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type); int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg); int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type); int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno); void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv); ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv); void ipfw_init_obj_rewriter(void); void ipfw_destroy_obj_rewriter(void); void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti); void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx); int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx); void ipfw_init_srv(struct ip_fw_chain *ch); void ipfw_destroy_srv(struct ip_fw_chain *ch); int 
ipfw_check_object_name_generic(const char *name); int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); /* In ip_fw_eaction.c */ typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_eaction_init(struct ip_fw_chain *ch, int first); void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last); uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name); int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id); int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule, uint16_t eaction_id, uint16_t default_id, uint16_t instance_id); int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t instance_id); /* In ip_fw_table.c */ struct table_info; typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, void *paddr, uint32_t *val); struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx); int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx); void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx); int ipfw_init_tables(struct ip_fw_chain *ch, int first); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); VNET_DECLARE(int, ipfw_nat_ready); #define V_ipfw_nat_ready VNET(ipfw_nat_ready) #define 
IPFW_NAT_LOADED (V_ipfw_nat_ready) extern ipfw_nat_t *ipfw_nat_ptr; extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; /* Helper functions for IP checksum adjustment */ static __inline uint16_t cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } static __inline uint16_t cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new) { return (~cksum_add(cksum_add(~oldsum, ~old), new)); } #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */