diff --git a/sbin/pfctl/pf_print_state.c b/sbin/pfctl/pf_print_state.c
index f0ad9a427006..cabfc78de97a 100644
--- a/sbin/pfctl/pf_print_state.c
+++ b/sbin/pfctl/pf_print_state.c
@@ -1,431 +1,465 @@
/*	$OpenBSD: pf_print_state.c,v 1.52 2008/08/12 16:40:18 david Exp $ */

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001 Daniel Hartmeier
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/endian.h>
#include <netinet/in.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
+#include <netinet/sctp.h>
#include <net/if.h>
#include <net/pfvar.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "pfctl_parser.h"
#include "pfctl.h"

void	print_name(struct pf_addr *, sa_family_t);

void
print_addr(struct pf_addr_wrap *addr, sa_family_t af, int verbose)
{
	switch (addr->type) {
	case PF_ADDR_DYNIFTL:
		printf("(%s", addr->v.ifname);
		if (addr->iflags & PFI_AFLAG_NETWORK)
			printf(":network");
		if (addr->iflags & PFI_AFLAG_BROADCAST)
			printf(":broadcast");
		if (addr->iflags & PFI_AFLAG_PEER)
			printf(":peer");
		if (addr->iflags & PFI_AFLAG_NOALIAS)
			printf(":0");
		if (verbose) {
			if (addr->p.dyncnt <= 0)
				printf(":*");
			else
				printf(":%d", addr->p.dyncnt);
		}
		printf(")");
		break;
	case PF_ADDR_TABLE:
		if (verbose)
			if (addr->p.tblcnt == -1)
				printf("<%s:*>", addr->v.tblname);
			else
				printf("<%s:%d>", addr->v.tblname,
				    addr->p.tblcnt);
		else
			printf("<%s>", addr->v.tblname);
		return;
	case PF_ADDR_RANGE: {
		char buf[48];

		if (inet_ntop(af, &addr->v.a.addr, buf, sizeof(buf)) == NULL)
			printf("?");
		else
			printf("%s", buf);
		if (inet_ntop(af, &addr->v.a.mask, buf, sizeof(buf)) == NULL)
			printf(" - ?");
		else
			printf(" - %s", buf);
		break;
	}
	case PF_ADDR_ADDRMASK:
		if (PF_AZERO(&addr->v.a.addr, AF_INET6) &&
		    PF_AZERO(&addr->v.a.mask, AF_INET6))
			printf("any");
		else {
			char buf[48];

			if (inet_ntop(af, &addr->v.a.addr, buf,
			    sizeof(buf)) == NULL)
				printf("?");
			else
				printf("%s", buf);
		}
		break;
	case PF_ADDR_NOROUTE:
		printf("no-route");
		return;
	case PF_ADDR_URPFFAILED:
		printf("urpf-failed");
		return;
	default:
		printf("?");
		return;
	}

	/* mask if not _both_ address and mask are zero */
	if (addr->type != PF_ADDR_RANGE &&
	    !(PF_AZERO(&addr->v.a.addr, AF_INET6) &&
	    PF_AZERO(&addr->v.a.mask, AF_INET6))) {
		int bits = unmask(&addr->v.a.mask, af);

		if (bits != (af == AF_INET ? 32 : 128))
			printf("/%d", bits);
	}
}
void
print_name(struct pf_addr *addr, sa_family_t af)
{
	char host[NI_MAXHOST];

	strlcpy(host, "?", sizeof(host));
	switch (af) {
	case AF_INET: {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = addr->v4;
		getnameinfo((struct sockaddr *)&sin, sin.sin_len,
		    host, sizeof(host), NULL, 0, NI_NOFQDN);
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 sin6;

		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_len = sizeof(sin6);
		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = addr->v6;
		getnameinfo((struct sockaddr *)&sin6, sin6.sin6_len,
		    host, sizeof(host), NULL, 0, NI_NOFQDN);
		break;
	}
	}
	printf("%s", host);
}

void
print_host(struct pf_addr *addr, u_int16_t port, sa_family_t af, int opts)
{
	if (opts & PF_OPT_USEDNS)
		print_name(addr, af);
	else {
		struct pf_addr_wrap aw;

		memset(&aw, 0, sizeof(aw));
		aw.v.a.addr = *addr;
		if (af == AF_INET)
			aw.v.a.mask.addr32[0] = 0xffffffff;
		else {
			memset(&aw.v.a.mask, 0xff, sizeof(aw.v.a.mask));
			af = AF_INET6;
		}
		print_addr(&aw, af, opts & PF_OPT_VERBOSE2);
	}

	if (port) {
		if (af == AF_INET)
			printf(":%u", ntohs(port));
		else
			printf("[%u]", ntohs(port));
	}
}

void
print_seq(struct pfctl_state_peer *p)
{
	if (p->seqdiff)
		printf("[%u + %u](+%u)", p->seqlo, p->seqhi - p->seqlo,
		    p->seqdiff);
	else
		printf("[%u + %u]", p->seqlo, p->seqhi - p->seqlo);
}
+
+static const char *
+sctp_state_name(int state)
+{
+	switch (state) {
+	case SCTP_CLOSED:
+		return ("CLOSED");
+	case SCTP_BOUND:
+		return ("BOUND");
+	case SCTP_LISTEN:
+		return ("LISTEN");
+	case SCTP_COOKIE_WAIT:
+		return ("COOKIE_WAIT");
+	case SCTP_COOKIE_ECHOED:
+		return ("COOKIE_ECHOED");
+	case SCTP_ESTABLISHED:
+		return ("ESTABLISHED");
+	case SCTP_SHUTDOWN_SENT:
+		return ("SHUTDOWN_SENT");
+	case SCTP_SHUTDOWN_RECEIVED:
+		return ("SHUTDOWN_RECEIVED");
+	case SCTP_SHUTDOWN_ACK_SENT:
+		return ("SHUTDOWN_ACK_SENT");
+	case SCTP_SHUTDOWN_PENDING:
+		return ("SHUTDOWN_PENDING");
+	default:
+		return ("?");
+	}
+}
+
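Aside (not part of the patch): unlike UDP and the "other" protocols below, which index a PFUDPS_NAMES/PFOTHERS_NAMES array by a dense 0..N state level, the SCTP_* state values from <netinet/sctp.h> are sparse bit flags, so a switch is the natural lookup here. A minimal standalone sketch of the same mapping; the function body is duplicated under a hypothetical name because sctp_state_name() is static to pf_print_state.c:

#include <sys/types.h>
#include <netinet/sctp.h>
#include <stdio.h>

/* Two-case copy of the lookup; unknown values fall back to "?". */
static const char *
demo_sctp_state_name(int state)
{
	switch (state) {
	case SCTP_ESTABLISHED:
		return ("ESTABLISHED");
	case SCTP_SHUTDOWN_PENDING:
		return ("SHUTDOWN_PENDING");
	default:
		return ("?");
	}
}

int
main(void)
{
	/* Prints "ESTABLISHED SHUTDOWN_PENDING ?". */
	printf("%s %s %s\n", demo_sctp_state_name(SCTP_ESTABLISHED),
	    demo_sctp_state_name(SCTP_SHUTDOWN_PENDING),
	    demo_sctp_state_name(42));
	return (0);
}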
void
print_state(struct pfctl_state *s, int opts)
{
	struct pfctl_state_peer *src, *dst;
	struct pfctl_state_key *key, *sk, *nk;
	const char *protoname;
	int min, sec;
	sa_family_t af;
	uint8_t proto;
#ifndef __NO_STRICT_ALIGNMENT
	struct pfctl_state_key aligned_key[2];

	bcopy(&s->key, aligned_key, sizeof(aligned_key));
	key = aligned_key;
#else
	key = s->key;
#endif

	af = s->key[PF_SK_WIRE].af;
	proto = s->key[PF_SK_WIRE].proto;

	if (s->direction == PF_OUT) {
		src = &s->src;
		dst = &s->dst;
		sk = &key[PF_SK_STACK];
		nk = &key[PF_SK_WIRE];
		if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
			sk->port[0] = nk->port[0];
	} else {
		src = &s->dst;
		dst = &s->src;
		sk = &key[PF_SK_WIRE];
		nk = &key[PF_SK_STACK];
		if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
			sk->port[1] = nk->port[1];
	}
	printf("%s ", s->ifname);
	if ((protoname = pfctl_proto2name(proto)) != NULL)
		printf("%s ", protoname);
	else
		printf("%u ", proto);
	print_host(&nk->addr[1], nk->port[1], af, opts);
	if (PF_ANEQ(&nk->addr[1], &sk->addr[1], af) ||
	    nk->port[1] != sk->port[1]) {
		printf(" (");
		print_host(&sk->addr[1], sk->port[1], af, opts);
		printf(")");
	}
	if (s->direction == PF_OUT)
		printf(" -> ");
	else
		printf(" <- ");
	print_host(&nk->addr[0], nk->port[0], af, opts);
	if (PF_ANEQ(&nk->addr[0], &sk->addr[0], af) ||
	    nk->port[0] != sk->port[0]) {
		printf(" (");
		print_host(&sk->addr[0], sk->port[0], af, opts);
		printf(")");
	}
	printf(" ");
	if (proto == IPPROTO_TCP) {
		if (src->state <= TCPS_TIME_WAIT &&
		    dst->state <= TCPS_TIME_WAIT)
			printf(" %s:%s\n", tcpstates[src->state],
			    tcpstates[dst->state]);
		else if (src->state == PF_TCPS_PROXY_SRC ||
		    dst->state == PF_TCPS_PROXY_SRC)
			printf(" PROXY:SRC\n");
		else if (src->state == PF_TCPS_PROXY_DST ||
		    dst->state == PF_TCPS_PROXY_DST)
			printf(" PROXY:DST\n");
		else
			printf(" <BAD STATE LEVELS %u:%u>\n",
			    src->state, dst->state);
		if (opts & PF_OPT_VERBOSE) {
			printf(" ");
			print_seq(src);
			if (src->wscale && dst->wscale)
				printf(" wscale %u",
				    src->wscale & PF_WSCALE_MASK);
			printf(" ");
			print_seq(dst);
			if (src->wscale && dst->wscale)
				printf(" wscale %u",
				    dst->wscale & PF_WSCALE_MASK);
			printf("\n");
		}
	} else if (proto == IPPROTO_UDP && src->state < PFUDPS_NSTATES &&
	    dst->state < PFUDPS_NSTATES) {
		const char *states[] = PFUDPS_NAMES;

		printf(" %s:%s\n", states[src->state], states[dst->state]);
+	} else if (proto == IPPROTO_SCTP) {
+		printf(" %s:%s\n", sctp_state_name(src->state),
+		    sctp_state_name(dst->state));
#ifndef INET6
	} else if (proto != IPPROTO_ICMP && src->state < PFOTHERS_NSTATES &&
	    dst->state < PFOTHERS_NSTATES) {
#else
	} else if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6 &&
	    src->state < PFOTHERS_NSTATES && dst->state < PFOTHERS_NSTATES) {
#endif
		/* XXX ICMP doesn't really have state levels */
		const char *states[] = PFOTHERS_NAMES;

		printf(" %s:%s\n", states[src->state], states[dst->state]);
	} else {
		printf(" %u:%u\n", src->state, dst->state);
	}

	if (opts & PF_OPT_VERBOSE) {
		u_int32_t creation = s->creation;
		u_int32_t expire = s->expire;

		sec = creation % 60;
		creation /= 60;
		min = creation % 60;
		creation /= 60;
		printf(" age %.2u:%.2u:%.2u", creation, min, sec);
		sec = expire % 60;
		expire /= 60;
		min = expire % 60;
		expire /= 60;
		printf(", expires in %.2u:%.2u:%.2u", expire, min, sec);

		printf(", %ju:%ju pkts, %ju:%ju bytes",
		    s->packets[0], s->packets[1], s->bytes[0], s->bytes[1]);
		if (s->anchor != -1)
			printf(", anchor %u", s->anchor);
		if (s->rule != -1)
			printf(", rule %u", s->rule);
		if (s->state_flags & PFSTATE_ALLOWOPTS)
			printf(", allow-opts");
		if (s->state_flags & PFSTATE_SLOPPY)
			printf(", sloppy");
		if (s->state_flags & PFSTATE_NOSYNC)
			printf(", no-sync");
		if (s->state_flags & PFSTATE_ACK)
			printf(", psync-ack");
		if (s->state_flags & PFSTATE_NODF)
			printf(", no-df");
		if (s->state_flags & PFSTATE_SETTOS)
			printf(", set-tos 0x%2.2x", s->set_tos);
		if (s->state_flags & PFSTATE_RANDOMID)
			printf(", random-id");
		if (s->state_flags & PFSTATE_SCRUB_TCP)
			printf(", reassemble-tcp");
		if (s->state_flags & PFSTATE_SETPRIO)
			printf(", set-prio (0x%02x 0x%02x)",
			    s->set_prio[0], s->set_prio[1]);
		if (s->dnpipe || s->dnrpipe) {
			if (s->state_flags & PFSTATE_DN_IS_PIPE)
				printf(", dummynet pipe (%d %d)",
				    s->dnpipe, s->dnrpipe);
			if (s->state_flags & PFSTATE_DN_IS_QUEUE)
				printf(", dummynet queue (%d %d)",
				    s->dnpipe, s->dnrpipe);
		}
		if (s->sync_flags & PFSYNC_FLAG_SRCNODE)
			printf(", source-track");
		if (s->sync_flags & PFSYNC_FLAG_NATSRCNODE)
			printf(", sticky-address");
		if (s->log)
			printf(", log");
		if (s->log & PF_LOG_ALL)
			printf(" (all)");
		if (s->min_ttl)
			printf(", min-ttl %d", s->min_ttl);
		if (s->max_mss)
			printf(", max-mss %d", s->max_mss);
		printf("\n");
	}
	if (opts & PF_OPT_VERBOSE2) {
		u_int64_t id;

		bcopy(&s->id, &id, sizeof(u_int64_t));
		printf(" id: %016jx creatorid: %08x", id, s->creatorid);
		if (s->rt) {
			switch (s->rt) {
			case PF_ROUTETO:
				printf(" route-to: ");
				break;
			case PF_DUPTO:
				printf(" dup-to: ");
				break;
			case PF_REPLYTO:
				printf(" reply-to: ");
				break;
			default:
				printf(" gateway: ");
			}
			print_host(&s->rt_addr, 0, af, opts);
			if (s->rt_ifname[0])
				printf("@%s", s->rt_ifname);
		}
		if (s->rtableid != -1)
			printf(" rtable: %d", s->rtableid);
		printf("\n");

		if (strcmp(s->ifname, s->orig_ifname) != 0)
			printf(" origif: %s\n", s->orig_ifname);
	}
}
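With the new branch above, an SCTP association listed by pfctl -ss comes out in the same shape as TCP and UDP states; roughly (interface, addresses and ports illustrative, column spacing approximate):

	vtnet0 sctp 198.51.100.2:5060 <- 192.0.2.1:5060       ESTABLISHED:ESTABLISHED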
int
unmask(struct pf_addr *m, sa_family_t af)
{
	int i = 31, j = 0, b = 0;
	u_int32_t tmp;

	while (j < 4 && m->addr32[j] == 0xffffffff) {
		b += 32;
		j++;
	}
	if (j < 4) {
		tmp = ntohl(m->addr32[j]);
		for (i = 31; tmp & (1 << i); --i)
			b++;
	}
	return (b);
}
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index e0c5fb5214a8..86e15f7c1709 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -1,2490 +1,2503 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001 Daniel Hartmeier
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *	$OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $
 *	$FreeBSD$
 */

#ifndef _NET_PFVAR_H_
#define _NET_PFVAR_H_

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/counter.h>
#include <sys/cpuset.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/nv.h>
#include <sys/refcount.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/tree.h>
#include <sys/seqc.h>
#include <vm/uma.h>

#include <net/if.h>
#include <net/ethernet.h>
#include <net/radix.h>
#include <netinet/in.h>
#ifdef _KERNEL
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
+#include <netinet/sctp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>
#endif

#include <netpfil/pf/pf.h>
#include <netpfil/pf/pf_altq.h>
#include <netpfil/pf/pf_mtag.h>

#ifdef _KERNEL

#if defined(__arm__)
#define PF_WANT_32_TO_64_COUNTER
#endif

/*
 * A hybrid of 32-bit and 64-bit counters which can be used on platforms where
 * counter(9) is very expensive.
 *
 * As 32-bit counters are expected to overflow, a periodic job sums them up to
 * a saved 64-bit state. Fetching the value still walks all CPUs to get the most
 * current snapshot.
 */
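Sketch (not part of the patch) of why the (uint32_t)(current - snapshot) delta used below stays correct across a 32-bit wrap: modulo-2^32 subtraction recovers the true increment count as long as fewer than 2^32 increments happen between periodic visits.

#include <stdint.h>
#include <assert.h>

int
main(void)
{
	uint32_t snapshot = 0xfffffff0u;	/* just short of wrapping */
	uint32_t current = 0x00000010u;		/* wrapped past zero */

	/* 0x10 - 0xfffffff0 mod 2^32 == 0x20: the 32 missed increments. */
	assert((uint32_t)(current - snapshot) == 0x20);
	return (0);
}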
#ifdef PF_WANT_32_TO_64_COUNTER
struct pf_counter_u64_pcpu {
	u_int32_t current;
	u_int32_t snapshot;
};

struct pf_counter_u64 {
	struct pf_counter_u64_pcpu *pfcu64_pcpu;
	u_int64_t pfcu64_value;
	seqc_t	pfcu64_seqc;
};

static inline int
pf_counter_u64_init(struct pf_counter_u64 *pfcu64, int flags)
{
	pfcu64->pfcu64_value = 0;
	pfcu64->pfcu64_seqc = 0;
	pfcu64->pfcu64_pcpu = uma_zalloc_pcpu(pcpu_zone_8, flags | M_ZERO);
	if (__predict_false(pfcu64->pfcu64_pcpu == NULL))
		return (ENOMEM);
	return (0);
}

static inline void
pf_counter_u64_deinit(struct pf_counter_u64 *pfcu64)
{

	uma_zfree_pcpu(pcpu_zone_8, pfcu64->pfcu64_pcpu);
}

static inline void
pf_counter_u64_critical_enter(void)
{

	critical_enter();
}

static inline void
pf_counter_u64_critical_exit(void)
{

	critical_exit();
}

static inline void
pf_counter_u64_add_protected(struct pf_counter_u64 *pfcu64, uint32_t n)
{
	struct pf_counter_u64_pcpu *pcpu;
	u_int32_t val;

	MPASS(curthread->td_critnest > 0);
	pcpu = zpcpu_get(pfcu64->pfcu64_pcpu);
	val = atomic_load_int(&pcpu->current);
	atomic_store_int(&pcpu->current, val + n);
}

static inline void
pf_counter_u64_add(struct pf_counter_u64 *pfcu64, uint32_t n)
{

	critical_enter();
	pf_counter_u64_add_protected(pfcu64, n);
	critical_exit();
}

static inline u_int64_t
pf_counter_u64_periodic(struct pf_counter_u64 *pfcu64)
{
	struct pf_counter_u64_pcpu *pcpu;
	u_int64_t sum;
	u_int32_t val;
	int cpu;

	MPASS(curthread->td_critnest > 0);
	seqc_write_begin(&pfcu64->pfcu64_seqc);
	sum = pfcu64->pfcu64_value;
	CPU_FOREACH(cpu) {
		pcpu = zpcpu_get_cpu(pfcu64->pfcu64_pcpu, cpu);
		val = atomic_load_int(&pcpu->current);
		sum += (uint32_t)(val - pcpu->snapshot);
		pcpu->snapshot = val;
	}
	pfcu64->pfcu64_value = sum;
	seqc_write_end(&pfcu64->pfcu64_seqc);
	return (sum);
}

static inline u_int64_t
pf_counter_u64_fetch(const struct pf_counter_u64 *pfcu64)
{
	struct pf_counter_u64_pcpu *pcpu;
	u_int64_t sum;
	seqc_t seqc;
	int cpu;

	for (;;) {
		seqc = seqc_read(&pfcu64->pfcu64_seqc);
		sum = 0;
		CPU_FOREACH(cpu) {
			pcpu = zpcpu_get_cpu(pfcu64->pfcu64_pcpu, cpu);
			sum += (uint32_t)(atomic_load_int(&pcpu->current) -
			    pcpu->snapshot);
		}
		sum += pfcu64->pfcu64_value;
		if (seqc_consistent(&pfcu64->pfcu64_seqc, seqc))
			break;
	}
	return (sum);
}

static inline void
pf_counter_u64_zero_protected(struct pf_counter_u64 *pfcu64)
{
	struct pf_counter_u64_pcpu *pcpu;
	int cpu;

	MPASS(curthread->td_critnest > 0);
	seqc_write_begin(&pfcu64->pfcu64_seqc);
	CPU_FOREACH(cpu) {
		pcpu = zpcpu_get_cpu(pfcu64->pfcu64_pcpu, cpu);
		pcpu->snapshot = atomic_load_int(&pcpu->current);
	}
	pfcu64->pfcu64_value = 0;
	seqc_write_end(&pfcu64->pfcu64_seqc);
}

static inline void
pf_counter_u64_zero(struct pf_counter_u64 *pfcu64)
{

	critical_enter();
	pf_counter_u64_zero_protected(pfcu64);
	critical_exit();
}
#else
struct pf_counter_u64 {
	counter_u64_t counter;
};

static inline int
pf_counter_u64_init(struct pf_counter_u64 *pfcu64, int flags)
{

	pfcu64->counter = counter_u64_alloc(flags);
	if (__predict_false(pfcu64->counter == NULL))
		return (ENOMEM);
	return (0);
}

static inline void
pf_counter_u64_deinit(struct pf_counter_u64 *pfcu64)
{

	counter_u64_free(pfcu64->counter);
}

static inline void
pf_counter_u64_critical_enter(void)
{

}

static inline void
pf_counter_u64_critical_exit(void)
{

}

static inline void
pf_counter_u64_add_protected(struct pf_counter_u64 *pfcu64, uint32_t n)
{

	counter_u64_add(pfcu64->counter, n);
}

static inline void
pf_counter_u64_add(struct pf_counter_u64 *pfcu64, uint32_t n)
{

	pf_counter_u64_add_protected(pfcu64, n);
}

static inline u_int64_t
pf_counter_u64_fetch(const struct pf_counter_u64 *pfcu64)
{

	return (counter_u64_fetch(pfcu64->counter));
}

static inline void
pf_counter_u64_zero_protected(struct pf_counter_u64 *pfcu64)
{

	counter_u64_zero(pfcu64->counter);
}

static inline void
pf_counter_u64_zero(struct pf_counter_u64 *pfcu64)
{

	pf_counter_u64_zero_protected(pfcu64);
}
#endif
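Usage sketch (assumed caller pattern, not taken from this header; pf_counter_u64_selftest is hypothetical): the API is identical across both implementations above, so a consumer allocates, bumps, fetches, zeroes and frees without caring which variant was compiled in.

static int
pf_counter_u64_selftest(void)
{
	struct pf_counter_u64 c;
	u_int64_t v;

	if (pf_counter_u64_init(&c, M_NOWAIT) != 0)
		return (ENOMEM);
	pf_counter_u64_add(&c, 1);	/* usable from any context */
	pf_counter_u64_add(&c, 2);
	v = pf_counter_u64_fetch(&c);	/* expect 3 */
	pf_counter_u64_zero(&c);	/* fetch() would now return 0 */
	pf_counter_u64_deinit(&c);
	return (v == 3 ? 0 : EDOOFUS);
}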
#define pf_get_timestamp(prule)({					\
	uint32_t _ts = 0;						\
	uint32_t __ts;							\
	int cpu;							\
	CPU_FOREACH(cpu) {						\
		__ts = *zpcpu_get_cpu(prule->timestamp, cpu);		\
		if (__ts > _ts)						\
			_ts = __ts;					\
	}								\
	_ts;								\
})

#define pf_update_timestamp(prule)					\
	do {								\
		critical_enter();					\
		*zpcpu_get((prule)->timestamp) = time_second;		\
		critical_exit();					\
	} while (0)

#define pf_timestamp_pcpu_zone	(sizeof(time_t) == 4 ? pcpu_zone_4 :	\
    pcpu_zone_8)
_Static_assert(sizeof(time_t) == 4 || sizeof(time_t) == 8,
    "unexpected time_t size");

SYSCTL_DECL(_net_pf);
MALLOC_DECLARE(M_PFHASH);
MALLOC_DECLARE(M_PF_RULE_ITEM);

SDT_PROVIDER_DECLARE(pf);

struct pfi_dynaddr {
	TAILQ_ENTRY(pfi_dynaddr)	 entry;
	struct pf_addr			 pfid_addr4;
	struct pf_addr			 pfid_mask4;
	struct pf_addr			 pfid_addr6;
	struct pf_addr			 pfid_mask6;
	struct pfr_ktable		*pfid_kt;
	struct pfi_kkif			*pfid_kif;
	int				 pfid_net;	/* mask or 128 */
	int				 pfid_acnt4;	/* address count IPv4 */
	int				 pfid_acnt6;	/* address count IPv6 */
	sa_family_t			 pfid_af;	/* rule af */
	u_int8_t			 pfid_iflags;	/* PFI_AFLAG_* */
};

/*
 * Address manipulation macros
 */
#define HTONL(x)	(x) = htonl((__uint32_t)(x))
#define HTONS(x)	(x) = htons((__uint16_t)(x))
#define NTOHL(x)	(x) = ntohl((__uint32_t)(x))
#define NTOHS(x)	(x) = ntohs((__uint16_t)(x))

#define PF_NAME		"pf"

#define	PF_HASHROW_ASSERT(h)	mtx_assert(&(h)->lock, MA_OWNED)
#define	PF_HASHROW_LOCK(h)	mtx_lock(&(h)->lock)
#define	PF_HASHROW_UNLOCK(h)	mtx_unlock(&(h)->lock)

#ifdef INVARIANTS
#define	PF_STATE_LOCK(s)						\
	do {								\
		struct pf_kstate *_s = (s);				\
		struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(_s)];	\
		MPASS(_s->lock == &_ih->lock);				\
		mtx_lock(_s->lock);					\
	} while (0)
#define	PF_STATE_UNLOCK(s)						\
	do {								\
		struct pf_kstate *_s = (s);				\
		struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(_s)];	\
		MPASS(_s->lock == &_ih->lock);				\
		mtx_unlock(_s->lock);					\
	} while (0)
#else
#define	PF_STATE_LOCK(s)	mtx_lock(s->lock)
#define	PF_STATE_UNLOCK(s)	mtx_unlock(s->lock)
#endif

#ifdef INVARIANTS
#define	PF_STATE_LOCK_ASSERT(s)						\
	do {								\
		struct pf_kstate *_s = (s);				\
		struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(_s)];	\
		MPASS(_s->lock == &_ih->lock);				\
		PF_HASHROW_ASSERT(_ih);					\
	} while (0)
#else /* !INVARIANTS */
#define	PF_STATE_LOCK_ASSERT(s)		do {} while (0)
#endif /* INVARIANTS */

#ifdef INVARIANTS
#define	PF_SRC_NODE_LOCK(sn)						\
	do {								\
		struct pf_ksrc_node *_sn = (sn);			\
		struct pf_srchash *_sh = &V_pf_srchash[			\
		    pf_hashsrc(&_sn->addr, _sn->af)];			\
		MPASS(_sn->lock == &_sh->lock);				\
		mtx_lock(_sn->lock);					\
	} while (0)
#define	PF_SRC_NODE_UNLOCK(sn)						\
	do {								\
		struct pf_ksrc_node *_sn = (sn);			\
		struct pf_srchash *_sh = &V_pf_srchash[			\
		    pf_hashsrc(&_sn->addr, _sn->af)];			\
		MPASS(_sn->lock == &_sh->lock);				\
		mtx_unlock(_sn->lock);					\
	} while (0)
#else
#define	PF_SRC_NODE_LOCK(sn)	mtx_lock((sn)->lock)
#define	PF_SRC_NODE_UNLOCK(sn)	mtx_unlock((sn)->lock)
#endif

#ifdef INVARIANTS
#define	PF_SRC_NODE_LOCK_ASSERT(sn)					\
	do {								\
		struct pf_ksrc_node *_sn = (sn);			\
		struct pf_srchash *_sh = &V_pf_srchash[			\
		    pf_hashsrc(&_sn->addr, _sn->af)];			\
		MPASS(_sn->lock == &_sh->lock);				\
		PF_HASHROW_ASSERT(_sh);					\
	} while (0)
#else /* !INVARIANTS */
#define	PF_SRC_NODE_LOCK_ASSERT(sn)	do {} while (0)
#endif /* INVARIANTS */
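Sketch of the locking contract the PF_STATE_LOCK macros encode (hypothetical helper, not from this diff): a state is only mutated under its id-hash row mutex, and with INVARIANTS the macros additionally assert that the state's lock pointer really is that row's lock.

static void
example_refresh_state(struct pf_kstate *s)
{

	PF_STATE_LOCK(s);
	s->expire = time_uptime;	/* mutate only under the row lock */
	PF_STATE_UNLOCK(s);
}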
extern struct mtx_padalign pf_unlnkdrules_mtx;
#define	PF_UNLNKDRULES_LOCK()	mtx_lock(&pf_unlnkdrules_mtx)
#define	PF_UNLNKDRULES_UNLOCK()	mtx_unlock(&pf_unlnkdrules_mtx)
#define	PF_UNLNKDRULES_ASSERT()	mtx_assert(&pf_unlnkdrules_mtx, MA_OWNED)

extern struct sx pf_config_lock;
#define	PF_CONFIG_LOCK()	sx_xlock(&pf_config_lock)
#define	PF_CONFIG_UNLOCK()	sx_xunlock(&pf_config_lock)
#define	PF_CONFIG_ASSERT()	sx_assert(&pf_config_lock, SA_XLOCKED)

VNET_DECLARE(struct rmlock, pf_rules_lock);
#define	V_pf_rules_lock		VNET(pf_rules_lock)

#define	PF_RULES_RLOCK_TRACKER	struct rm_priotracker _pf_rules_tracker
#define	PF_RULES_RLOCK()	rm_rlock(&V_pf_rules_lock, &_pf_rules_tracker)
#define	PF_RULES_RUNLOCK()	rm_runlock(&V_pf_rules_lock, &_pf_rules_tracker)
#define	PF_RULES_WLOCK()	rm_wlock(&V_pf_rules_lock)
#define	PF_RULES_WUNLOCK()	rm_wunlock(&V_pf_rules_lock)
#define	PF_RULES_WOWNED()	rm_wowned(&V_pf_rules_lock)
#define	PF_RULES_ASSERT()	rm_assert(&V_pf_rules_lock, RA_LOCKED)
#define	PF_RULES_RASSERT()	rm_assert(&V_pf_rules_lock, RA_RLOCKED)
#define	PF_RULES_WASSERT()	rm_assert(&V_pf_rules_lock, RA_WLOCKED)

extern struct mtx_padalign pf_table_stats_lock;
#define	PF_TABLE_STATS_LOCK()	mtx_lock(&pf_table_stats_lock)
#define	PF_TABLE_STATS_UNLOCK()	mtx_unlock(&pf_table_stats_lock)
#define	PF_TABLE_STATS_OWNED()	mtx_owned(&pf_table_stats_lock)
#define	PF_TABLE_STATS_ASSERT()	mtx_assert(&pf_table_stats_lock, MA_OWNED)

extern struct sx pf_end_lock;

#define	PF_MODVER	1
#define	PFLOG_MODVER	1
#define	PFSYNC_MODVER	1

#define	PFLOG_MINVER	1
#define	PFLOG_PREFVER	PFLOG_MODVER
#define	PFLOG_MAXVER	1
#define	PFSYNC_MINVER	1
#define	PFSYNC_PREFVER	PFSYNC_MODVER
#define	PFSYNC_MAXVER	1

#ifdef INET
#ifndef INET6
#define	PF_INET_ONLY
#endif /* ! INET6 */
#endif /* INET */

#ifdef INET6
#ifndef INET
#define	PF_INET6_ONLY
#endif /* ! INET */
#endif /* INET6 */

#ifdef INET
#ifdef INET6
#define	PF_INET_INET6
#endif /* INET6 */
#endif /* INET */

#else

#define	PF_INET_INET6

#endif /* _KERNEL */

/* Both IPv4 and IPv6 */
#ifdef PF_INET_INET6

#define PF_AEQ(a, b, c) \
	((c == AF_INET && (a)->addr32[0] == (b)->addr32[0]) || \
	(c == AF_INET6 && (a)->addr32[3] == (b)->addr32[3] && \
	(a)->addr32[2] == (b)->addr32[2] && \
	(a)->addr32[1] == (b)->addr32[1] && \
	(a)->addr32[0] == (b)->addr32[0])) \

#define PF_ANEQ(a, b, c) \
	((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \
	(c == AF_INET6 && ((a)->addr32[0] != (b)->addr32[0] || \
	(a)->addr32[1] != (b)->addr32[1] || \
	(a)->addr32[2] != (b)->addr32[2] || \
	(a)->addr32[3] != (b)->addr32[3]))) \

#define PF_AZERO(a, c) \
	((c == AF_INET && !(a)->addr32[0]) || \
	(c == AF_INET6 && !(a)->addr32[0] && !(a)->addr32[1] && \
	!(a)->addr32[2] && !(a)->addr32[3] )) \

#define PF_MATCHA(n, a, m, b, f) \
	pf_match_addr(n, a, m, b, f)

#define PF_ACPY(a, b, f) \
	pf_addrcpy(a, b, f)

#define PF_AINC(a, f) \
	pf_addr_inc(a, f)

#define PF_POOLMASK(a, b, c, d, f) \
	pf_poolmask(a, b, c, d, f)

#else

/* Just IPv6 */

#ifdef PF_INET6_ONLY

#define PF_AEQ(a, b, c) \
	((a)->addr32[3] == (b)->addr32[3] && \
	(a)->addr32[2] == (b)->addr32[2] && \
	(a)->addr32[1] == (b)->addr32[1] && \
	(a)->addr32[0] == (b)->addr32[0]) \

#define PF_ANEQ(a, b, c) \
	((a)->addr32[3] != (b)->addr32[3] || \
	(a)->addr32[2] != (b)->addr32[2] || \
	(a)->addr32[1] != (b)->addr32[1] || \
	(a)->addr32[0] != (b)->addr32[0]) \

#define PF_AZERO(a, c) \
	(!(a)->addr32[0] && \
	!(a)->addr32[1] && \
	!(a)->addr32[2] && \
	!(a)->addr32[3] ) \

#define PF_MATCHA(n, a, m, b, f) \
	pf_match_addr(n, a, m, b, f)

#define PF_ACPY(a, b, f) \
	pf_addrcpy(a, b, f)

#define PF_AINC(a, f) \
	pf_addr_inc(a, f)

#define PF_POOLMASK(a, b, c, d, f) \
	pf_poolmask(a, b, c, d, f)
#else

/* Just IPv4 */
#ifdef PF_INET_ONLY

#define PF_AEQ(a, b, c) \
	((a)->addr32[0] == (b)->addr32[0])

#define PF_ANEQ(a, b, c) \
	((a)->addr32[0] != (b)->addr32[0])

#define PF_AZERO(a, c) \
	(!(a)->addr32[0])

#define PF_MATCHA(n, a, m, b, f) \
	pf_match_addr(n, a, m, b, f)

#define PF_ACPY(a, b, f) \
	(a)->v4.s_addr = (b)->v4.s_addr

#define PF_AINC(a, f) \
	do { \
		(a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \
	} while (0)

#define PF_POOLMASK(a, b, c, d, f) \
	do { \
		(a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \
		(((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \
	} while (0)

#endif /* PF_INET_ONLY */
#endif /* PF_INET6_ONLY */
#endif /* PF_INET_INET6 */

/*
 * XXX callers not FIB-aware in our version of pf yet.
 * OpenBSD fixed it later it seems, 2010/05/07 13:33:16 claudio.
 */
#define	PF_MISMATCHAW(aw, x, af, neg, ifp, rtid)			\
	(								\
		(((aw)->type == PF_ADDR_NOROUTE &&			\
		    pf_routable((x), (af), NULL, (rtid))) ||		\
		(((aw)->type == PF_ADDR_URPFFAILED && (ifp) != NULL &&	\
		    pf_routable((x), (af), (ifp), (rtid))) ||		\
		((aw)->type == PF_ADDR_TABLE &&				\
		    !pfr_match_addr((aw)->p.tbl, (x), (af))) ||		\
		((aw)->type == PF_ADDR_DYNIFTL &&			\
		    !pfi_match_addr((aw)->p.dyn, (x), (af))) ||		\
		((aw)->type == PF_ADDR_RANGE &&				\
		    !pf_match_addr_range(&(aw)->v.a.addr,		\
		    &(aw)->v.a.mask, (x), (af))) ||			\
		((aw)->type == PF_ADDR_ADDRMASK &&			\
		    !PF_AZERO(&(aw)->v.a.mask, (af)) &&			\
		    !PF_MATCHA(0, &(aw)->v.a.addr,			\
		    &(aw)->v.a.mask, (x), (af))))) !=			\
		(neg)							\
	)

#define PF_ALGNMNT(off) (((off) % 2) == 0)

#ifdef _KERNEL

struct pf_kpooladdr {
	struct pf_addr_wrap		 addr;
	TAILQ_ENTRY(pf_kpooladdr)	 entries;
	char				 ifname[IFNAMSIZ];
	struct pfi_kkif			*kif;
};

TAILQ_HEAD(pf_kpalist, pf_kpooladdr);

struct pf_kpool {
	struct mtx		 mtx;
	struct pf_kpalist	 list;
	struct pf_kpooladdr	*cur;
	struct pf_poolhashkey	 key;
	struct pf_addr		 counter;
	struct pf_mape_portset	 mape;
	int			 tblidx;
	u_int16_t		 proxy_port[2];
	u_int8_t		 opts;
};

struct pf_rule_actions {
	int32_t		 rtableid;
	uint16_t	 qid;
	uint16_t	 pqid;
	uint16_t	 max_mss;
	uint8_t		 log;
	uint8_t		 set_tos;
	uint8_t		 min_ttl;
	uint16_t	 dnpipe;
	uint16_t	 dnrpipe;	/* Reverse direction pipe */
	uint32_t	 flags;
	uint8_t		 set_prio[2];
};

union pf_keth_rule_ptr {
	struct pf_keth_rule	*ptr;
	uint32_t		nr;
};

struct pf_keth_rule_addr {
	uint8_t	addr[ETHER_ADDR_LEN];
	uint8_t	mask[ETHER_ADDR_LEN];
	bool neg;
	uint8_t	isset;
};

struct pf_keth_anchor;

TAILQ_HEAD(pf_keth_ruleq, pf_keth_rule);

struct pf_keth_ruleset {
	struct pf_keth_ruleq		 rules[2];
	struct pf_keth_rules {
		struct pf_keth_ruleq	*rules;
		int			 open;
		uint32_t		 ticket;
	} active, inactive;
	struct epoch_context	 epoch_ctx;
	struct vnet		*vnet;
	struct pf_keth_anchor	*anchor;
};

RB_HEAD(pf_keth_anchor_global, pf_keth_anchor);
RB_HEAD(pf_keth_anchor_node, pf_keth_anchor);
struct pf_keth_anchor {
	RB_ENTRY(pf_keth_anchor)	 entry_node;
	RB_ENTRY(pf_keth_anchor)	 entry_global;
	struct pf_keth_anchor		*parent;
	struct pf_keth_anchor_node	 children;
	char				 name[PF_ANCHOR_NAME_SIZE];
	char				 path[MAXPATHLEN];
	struct pf_keth_ruleset		 ruleset;
	int				 refcnt;	/* anchor rules */
	uint8_t				 anchor_relative;
	uint8_t				 anchor_wildcard;
};
RB_PROTOTYPE(pf_keth_anchor_node, pf_keth_anchor, entry_node,
    pf_keth_anchor_compare);
RB_PROTOTYPE(pf_keth_anchor_global, pf_keth_anchor, entry_global,
    pf_keth_anchor_compare);

struct pf_keth_rule {
#define PFE_SKIP_IFP		0
#define PFE_SKIP_DIR		1
#define PFE_SKIP_PROTO		2
#define PFE_SKIP_SRC_ADDR	3
#define PFE_SKIP_DST_ADDR	4
#define PFE_SKIP_SRC_IP_ADDR	5
#define PFE_SKIP_DST_IP_ADDR	6
#define PFE_SKIP_COUNT		7
	union pf_keth_rule_ptr	 skip[PFE_SKIP_COUNT];
	TAILQ_ENTRY(pf_keth_rule)	 entries;

	struct pf_keth_anchor	*anchor;
	u_int8_t		 anchor_relative;
	u_int8_t		 anchor_wildcard;

	uint32_t		 nr;

	bool			 quick;

	/* Filter */
	char			 ifname[IFNAMSIZ];
	struct pfi_kkif		*kif;
	bool			 ifnot;
	uint8_t			 direction;
	uint16_t		 proto;
	struct pf_keth_rule_addr src, dst;
	struct pf_rule_addr	 ipsrc, ipdst;
	char			 match_tagname[PF_TAG_NAME_SIZE];
	uint16_t		 match_tag;
	bool			 match_tag_not;

	/* Stats */
	counter_u64_t		 evaluations;
	counter_u64_t		 packets[2];
	counter_u64_t		 bytes[2];
	time_t			*timestamp;

	/* Action */
	char			 qname[PF_QNAME_SIZE];
	int			 qid;
	char			 tagname[PF_TAG_NAME_SIZE];
	uint16_t		 tag;
	char			 bridge_to_name[IFNAMSIZ];
	struct pfi_kkif		*bridge_to;
	uint8_t			 action;
	uint16_t		 dnpipe;
	uint32_t		 dnflags;

	char			 label[PF_RULE_MAX_LABEL_COUNT][PF_RULE_LABEL_SIZE];
	uint32_t		 ridentifier;
};

union pf_krule_ptr {
	struct pf_krule		*ptr;
	u_int32_t		 nr;
};

RB_HEAD(pf_krule_global, pf_krule);
RB_PROTOTYPE(pf_krule_global, pf_krule, entry_global, pf_krule_compare);

struct pf_krule {
	struct pf_rule_addr	 src;
	struct pf_rule_addr	 dst;
	union pf_krule_ptr	 skip[PF_SKIP_COUNT];
	char			 label[PF_RULE_MAX_LABEL_COUNT][PF_RULE_LABEL_SIZE];
	uint32_t		 ridentifier;
	char			 ifname[IFNAMSIZ];
	char			 qname[PF_QNAME_SIZE];
	char			 pqname[PF_QNAME_SIZE];
	char			 tagname[PF_TAG_NAME_SIZE];
	char			 match_tagname[PF_TAG_NAME_SIZE];

	char			 overload_tblname[PF_TABLE_NAME_SIZE];

	TAILQ_ENTRY(pf_krule)	 entries;
	struct pf_kpool		 rpool;

	struct pf_counter_u64	 evaluations;
	struct pf_counter_u64	 packets[2];
	struct pf_counter_u64	 bytes[2];
	time_t			*timestamp;

	struct pfi_kkif		*kif;
	struct pf_kanchor	*anchor;
	struct pfr_ktable	*overload_tbl;

	pf_osfp_t		 os_fingerprint;

	int32_t			 rtableid;
	u_int32_t		 timeout[PFTM_MAX];
	u_int32_t		 max_states;
	u_int32_t		 max_src_nodes;
	u_int32_t		 max_src_states;
	u_int32_t		 max_src_conn;
	struct {
		u_int32_t		limit;
		u_int32_t		seconds;
	}			 max_src_conn_rate;
	u_int16_t		 qid;
	u_int16_t		 pqid;
	u_int16_t		 dnpipe;
	u_int16_t		 dnrpipe;
	u_int32_t		 free_flags;
	u_int32_t		 nr;
	u_int32_t		 prob;
	uid_t			 cuid;
	pid_t			 cpid;

	counter_u64_t		 states_cur;
	counter_u64_t		 states_tot;
	counter_u64_t		 src_nodes;

	u_int16_t		 return_icmp;
	u_int16_t		 return_icmp6;
	u_int16_t		 max_mss;
	u_int16_t		 tag;
	u_int16_t		 match_tag;
	u_int16_t		 scrub_flags;

	struct pf_rule_uid	 uid;
	struct pf_rule_gid	 gid;

	u_int32_t		 rule_flag;
	uint32_t		 rule_ref;
	u_int8_t		 action;
	u_int8_t		 direction;
	u_int8_t		 log;
	u_int8_t		 logif;
	u_int8_t		 quick;
	u_int8_t		 ifnot;
	u_int8_t		 match_tag_not;
	u_int8_t		 natpass;

	u_int8_t		 keep_state;
	sa_family_t		 af;
	u_int8_t		 proto;
	u_int8_t		 type;
	u_int8_t		 code;
	u_int8_t		 flags;
	u_int8_t		 flagset;
	u_int8_t		 min_ttl;
	u_int8_t		 allow_opts;
	u_int8_t		 rt;
	u_int8_t		 return_ttl;
	u_int8_t		 tos;
	u_int8_t		 set_tos;
	u_int8_t		 anchor_relative;
	u_int8_t		 anchor_wildcard;

	u_int8_t		 flush;
	u_int8_t		 prio;
	u_int8_t		 set_prio[2];

	struct {
		struct pf_addr		addr;
		u_int16_t		port;
	}			divert;
	u_int8_t		 md5sum[PF_MD5_DIGEST_LENGTH];
	RB_ENTRY(pf_krule)	 entry_global;

#ifdef PF_WANT_32_TO_64_COUNTER
	LIST_ENTRY(pf_krule)	 allrulelist;
	bool			 allrulelinked;
#endif
};

struct pf_krule_item {
	SLIST_ENTRY(pf_krule_item)	 entry;
	struct pf_krule			*r;
};

SLIST_HEAD(pf_krule_slist, pf_krule_item);

struct pf_ksrc_node {
	LIST_ENTRY(pf_ksrc_node) entry;
	struct pf_addr		 addr;
	struct pf_addr		 raddr;
	struct pf_krule_slist	 match_rules;
	union pf_krule_ptr	 rule;
	struct pfi_kkif		*kif;
	counter_u64_t		 bytes[2];
	counter_u64_t		 packets[2];
	u_int32_t		 states;
	u_int32_t		 conn;
	struct pf_threshold	 conn_rate;
	u_int32_t		 creation;
	u_int32_t		 expire;
	sa_family_t		 af;
	u_int8_t		 ruletype;
	struct mtx		*lock;
};
#endif

struct pf_state_scrub {
	struct timeval	pfss_last;	/* time received last packet	*/
	u_int32_t	pfss_tsecr;	/* last echoed timestamp	*/
	u_int32_t	pfss_tsval;	/* largest timestamp		*/
	u_int32_t	pfss_tsval0;	/* original timestamp		*/
	u_int16_t	pfss_flags;
#define PFSS_TIMESTAMP	0x0001		/* modulate timestamp		*/
#define PFSS_PAWS	0x0010		/* stricter PAWS checks		*/
#define PFSS_PAWS_IDLED	0x0020		/* was idle too long.  no PAWS	*/
#define PFSS_DATA_TS	0x0040		/* timestamp on data packets	*/
#define PFSS_DATA_NOTS	0x0080		/* no timestamp on data packets	*/
	u_int8_t	pfss_ttl;	/* stashed TTL			*/
	u_int8_t	pad;
	u_int32_t	pfss_ts_mod;	/* timestamp modulation		*/
};

struct pf_state_host {
	struct pf_addr	addr;
	u_int16_t	port;
	u_int16_t	pad;
};

struct pf_state_peer {
	struct pf_state_scrub	*scrub;	/* state is scrubbed		*/
	u_int32_t	seqlo;		/* Max sequence number sent	*/
	u_int32_t	seqhi;		/* Max the other end ACKd + win	*/
	u_int32_t	seqdiff;	/* Sequence number modulator	*/
	u_int16_t	max_win;	/* largest window (pre scaling)	*/
	u_int16_t	mss;		/* Maximum segment size option	*/
	u_int8_t	state;		/* active state level		*/
	u_int8_t	wscale;		/* window scaling factor	*/
	u_int8_t	tcp_est;	/* Did we reach TCPS_ESTABLISHED */
	u_int8_t	pad[1];
};

/* Keep synced with struct pf_state_key. */
struct pf_state_key_cmp {
	struct pf_addr	 addr[2];
	u_int16_t	 port[2];
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 pad[2];
};

struct pf_state_key {
	struct pf_addr	 addr[2];
	u_int16_t	 port[2];
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 pad[2];

	LIST_ENTRY(pf_state_key) entry;
	TAILQ_HEAD(, pf_kstate)	 states[2];
};

/* Keep synced with struct pf_kstate. */
struct pf_state_cmp {
	u_int64_t		 id;
	u_int32_t		 creatorid;
	u_int8_t		 direction;
	u_int8_t		 pad[3];
};

struct pf_state_scrub_export {
	uint16_t	pfss_flags;
	uint8_t		pfss_ttl;	/* stashed TTL */
#define PF_SCRUB_FLAG_VALID		0x01
	uint8_t		scrub_flag;
	uint32_t	pfss_ts_mod;	/* timestamp modulation */
};

struct pf_state_key_export {
	struct pf_addr	 addr[2];
	uint16_t	 port[2];
};

struct pf_state_peer_export {
	struct pf_state_scrub_export	scrub;	/* state is scrubbed	*/
	uint32_t	seqlo;		/* Max sequence number sent	*/
	uint32_t	seqhi;		/* Max the other end ACKd + win	*/
	uint32_t	seqdiff;	/* Sequence number modulator	*/
	uint16_t	max_win;	/* largest window (pre scaling)	*/
	uint16_t	mss;		/* Maximum segment size option	*/
	uint8_t		state;		/* active state level		*/
	uint8_t		wscale;		/* window scaling factor	*/
	uint8_t		dummy[6];
};
_Static_assert(sizeof(struct pf_state_peer_export) == 32, "size incorrect");

struct pf_state_export {
	uint64_t	 version;
#define	PF_STATE_VERSION	20230404
	uint64_t	 id;
	char		 ifname[IFNAMSIZ];
	char		 orig_ifname[IFNAMSIZ];
	struct pf_state_key_export	 key[2];
	struct pf_state_peer_export	 src;
	struct pf_state_peer_export	 dst;
	struct pf_addr	 rt_addr;
	uint32_t	 rule;
	uint32_t	 anchor;
	uint32_t	 nat_rule;
	uint32_t	 creation;
	uint32_t	 expire;
	uint32_t	 spare0;
	uint64_t	 packets[2];
	uint64_t	 bytes[2];
	uint32_t	 creatorid;
	uint32_t	 spare1;
	sa_family_t	 af;
	uint8_t		 proto;
	uint8_t		 direction;
	uint8_t		 log;
	uint8_t		 state_flags_compat;
	uint8_t		 timeout;
	uint8_t		 sync_flags;
	uint8_t		 updates;
	uint16_t	 state_flags;
	uint16_t	 qid;
	uint16_t	 pqid;
	uint16_t	 dnpipe;
	uint16_t	 dnrpipe;
	int32_t		 rtableid;
	uint8_t		 min_ttl;
	uint8_t		 set_tos;
	uint16_t	 max_mss;
	uint8_t		 set_prio[2];
	uint8_t		 rt;
	char		 rt_ifname[IFNAMSIZ];

	uint8_t		 spare[72];
};
_Static_assert(sizeof(struct pf_state_export) == 384, "size incorrect");

#ifdef _KERNEL
struct pf_kstate {
	/*
	 * Area shared with pf_state_cmp
	 */
	u_int64_t		 id;
	u_int32_t		 creatorid;
	u_int8_t		 direction;
	u_int8_t		 pad[3];
	/*
	 * end of the area
	 */

	u_int16_t		 state_flags;
	u_int8_t		 timeout;
	u_int8_t		 sync_state; /* PFSYNC_S_x */
	u_int8_t		 sync_updates;	/* XXX */
	u_int			 refs;
	struct mtx		*lock;
	TAILQ_ENTRY(pf_kstate)	 sync_list;
	TAILQ_ENTRY(pf_kstate)	 key_list[2];
	LIST_ENTRY(pf_kstate)	 entry;
	struct pf_state_peer	 src;
	struct pf_state_peer	 dst;
	struct pf_krule_slist	 match_rules;
	union pf_krule_ptr	 rule;
	union pf_krule_ptr	 anchor;
	union pf_krule_ptr	 nat_rule;
	struct pf_addr		 rt_addr;
	struct pf_state_key	*key[2];	/* addresses stack and wire  */
	struct pfi_kkif		*kif;
	struct pfi_kkif		*orig_kif;	/* The real kif, even if we're a floating state (i.e. if == V_pfi_all). */
	struct pfi_kkif		*rt_kif;
	struct pf_ksrc_node	*src_node;
	struct pf_ksrc_node	*nat_src_node;
	u_int64_t		 packets[2];
	u_int64_t		 bytes[2];
	u_int32_t		 creation;
	u_int32_t		 expire;
	u_int32_t		 pfsync_time;
	struct pf_rule_actions	 act;
	u_int16_t		 tag;
	u_int8_t		 rt;
};

/*
 * Size <= fits 11 objects per page on LP64. Try to not grow the struct beyond that.
 */
_Static_assert(sizeof(struct pf_kstate) <= 368, "pf_kstate size crosses 368 bytes");
#endif

/*
 * Unified state structures for pulling states out of the kernel
 * used by pfsync(4) and the pf(4) ioctl.
 */
struct pfsync_state_scrub {
	u_int16_t	pfss_flags;
	u_int8_t	pfss_ttl;	/* stashed TTL		*/
#define PFSYNC_SCRUB_FLAG_VALID		0x01
	u_int8_t	scrub_flag;
	u_int32_t	pfss_ts_mod;	/* timestamp modulation	*/
} __packed;

struct pfsync_state_peer {
	struct pfsync_state_scrub scrub;	/* state is scrubbed	*/
	u_int32_t	seqlo;		/* Max sequence number sent	*/
	u_int32_t	seqhi;		/* Max the other end ACKd + win	*/
	u_int32_t	seqdiff;	/* Sequence number modulator	*/
	u_int16_t	max_win;	/* largest window (pre scaling)	*/
	u_int16_t	mss;		/* Maximum segment size option	*/
	u_int8_t	state;		/* active state level		*/
	u_int8_t	wscale;		/* window scaling factor	*/
	u_int8_t	pad[6];
} __packed;

struct pfsync_state_key {
	struct pf_addr	 addr[2];
	u_int16_t	 port[2];
};

struct pfsync_state_1301 {
	u_int64_t	 id;
	char		 ifname[IFNAMSIZ];
	struct pfsync_state_key	key[2];
	struct pfsync_state_peer src;
	struct pfsync_state_peer dst;
	struct pf_addr	 rt_addr;
	u_int32_t	 rule;
	u_int32_t	 anchor;
	u_int32_t	 nat_rule;
	u_int32_t	 creation;
	u_int32_t	 expire;
	u_int32_t	 packets[2][2];
	u_int32_t	 bytes[2][2];
	u_int32_t	 creatorid;
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 direction;
	u_int8_t	 __spare[2];
	u_int8_t	 log;
	u_int8_t	 state_flags;
	u_int8_t	 timeout;
	u_int8_t	 sync_flags;
	u_int8_t	 updates;
} __packed;

struct pfsync_state_1400 {
	/* The beginning of the struct is compatible with previous versions */
	u_int64_t	 id;
	char		 ifname[IFNAMSIZ];
	struct pfsync_state_key	key[2];
	struct pfsync_state_peer src;
	struct pfsync_state_peer dst;
	struct pf_addr	 rt_addr;
	u_int32_t	 rule;
	u_int32_t	 anchor;
	u_int32_t	 nat_rule;
	u_int32_t	 creation;
	u_int32_t	 expire;
	u_int32_t	 packets[2][2];
	u_int32_t	 bytes[2][2];
	u_int32_t	 creatorid;
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 direction;
	u_int16_t	 state_flags;
	u_int8_t	 log;
	u_int8_t	 __spare;
	u_int8_t	 timeout;
	u_int8_t	 sync_flags;
	u_int8_t	 updates;
	/* The rest is not */
	u_int16_t	 qid;
	u_int16_t	 pqid;
	u_int16_t	 dnpipe;
	u_int16_t	 dnrpipe;
	int32_t		 rtableid;
	u_int8_t	 min_ttl;
	u_int8_t	 set_tos;
	u_int16_t	 max_mss;
	u_int8_t	 set_prio[2];
	u_int8_t	 rt;
	char		 rt_ifname[IFNAMSIZ];
} __packed;

union pfsync_state_union {
	struct pfsync_state_1301 pfs_1301;
	struct pfsync_state_1400 pfs_1400;
} __packed;
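The 1301 and 1400 wire formats deliberately share their leading fields (the comment in pfsync_state_1400 marks where compatibility ends), so version-agnostic code can read the common prefix through either union member. A sketch with a hypothetical helper, not taken from this header:

static uint32_t
example_state_creatorid(const union pfsync_state_union *u)
{

	/* creatorid sits in the shared prefix of both layouts. */
	return (u->pfs_1301.creatorid);
}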
#ifdef _KERNEL
/* pfsync */
typedef int		pfsync_state_import_t(union pfsync_state_union *, int, int);
typedef	void		pfsync_insert_state_t(struct pf_kstate *);
typedef	void		pfsync_update_state_t(struct pf_kstate *);
typedef	void		pfsync_delete_state_t(struct pf_kstate *);
typedef void		pfsync_clear_states_t(u_int32_t, const char *);
typedef int		pfsync_defer_t(struct pf_kstate *, struct mbuf *);
typedef void		pfsync_detach_ifnet_t(struct ifnet *);

VNET_DECLARE(pfsync_state_import_t *, pfsync_state_import_ptr);
#define V_pfsync_state_import_ptr	VNET(pfsync_state_import_ptr)
VNET_DECLARE(pfsync_insert_state_t *, pfsync_insert_state_ptr);
#define V_pfsync_insert_state_ptr	VNET(pfsync_insert_state_ptr)
VNET_DECLARE(pfsync_update_state_t *, pfsync_update_state_ptr);
#define V_pfsync_update_state_ptr	VNET(pfsync_update_state_ptr)
VNET_DECLARE(pfsync_delete_state_t *, pfsync_delete_state_ptr);
#define V_pfsync_delete_state_ptr	VNET(pfsync_delete_state_ptr)
VNET_DECLARE(pfsync_clear_states_t *, pfsync_clear_states_ptr);
#define V_pfsync_clear_states_ptr	VNET(pfsync_clear_states_ptr)
VNET_DECLARE(pfsync_defer_t *, pfsync_defer_ptr);
#define V_pfsync_defer_ptr		VNET(pfsync_defer_ptr)
extern pfsync_detach_ifnet_t	*pfsync_detach_ifnet_ptr;

void			pfsync_state_export(union pfsync_state_union *,
			    struct pf_kstate *, int);
void			pf_state_export(struct pf_state_export *,
			    struct pf_kstate *);

/* pflog */
struct pf_kruleset;
struct pf_pdesc;
typedef int pflog_packet_t(struct pfi_kkif *, struct mbuf *, sa_family_t,
    u_int8_t, struct pf_krule *, struct pf_krule *, struct pf_kruleset *,
    struct pf_pdesc *, int);
extern pflog_packet_t		*pflog_packet_ptr;

#endif /* _KERNEL */

#define	PFSYNC_FLAG_SRCNODE	0x04
#define	PFSYNC_FLAG_NATSRCNODE	0x08

/* for copies to/from network byte order */
/* ioctl interface also uses network byte order */
#define pf_state_peer_hton(s,d) do {		\
	(d)->seqlo = htonl((s)->seqlo);		\
	(d)->seqhi = htonl((s)->seqhi);		\
	(d)->seqdiff = htonl((s)->seqdiff);	\
	(d)->max_win = htons((s)->max_win);	\
	(d)->mss = htons((s)->mss);		\
	(d)->state = (s)->state;		\
	(d)->wscale = (s)->wscale;		\
	if ((s)->scrub) {						\
		(d)->scrub.pfss_flags = 				\
		    htons((s)->scrub->pfss_flags & PFSS_TIMESTAMP);	\
		(d)->scrub.pfss_ttl = (s)->scrub->pfss_ttl;		\
		(d)->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);\
		(d)->scrub.scrub_flag = PFSYNC_SCRUB_FLAG_VALID;	\
	}								\
} while (0)

#define pf_state_peer_ntoh(s,d) do {		\
	(d)->seqlo = ntohl((s)->seqlo);		\
	(d)->seqhi = ntohl((s)->seqhi);		\
	(d)->seqdiff = ntohl((s)->seqdiff);	\
	(d)->max_win = ntohs((s)->max_win);	\
	(d)->mss = ntohs((s)->mss);		\
	(d)->state = (s)->state;		\
	(d)->wscale = (s)->wscale;		\
	if ((s)->scrub.scrub_flag == PFSYNC_SCRUB_FLAG_VALID && 	\
	    (d)->scrub != NULL) {					\
		(d)->scrub->pfss_flags =				\
		    ntohs((s)->scrub.pfss_flags) & PFSS_TIMESTAMP;	\
		(d)->scrub->pfss_ttl = (s)->scrub.pfss_ttl;		\
		(d)->scrub->pfss_ts_mod = ntohl((s)->scrub.pfss_ts_mod);\
	}								\
} while (0)

#define pf_state_counter_hton(s,d) do {				\
	d[0] = htonl((s>>32)&0xffffffff);			\
	d[1] = htonl(s&0xffffffff);				\
} while (0)

#define pf_state_counter_from_pfsync(s)				\
	(((u_int64_t)(s[0])<<32) | (u_int64_t)(s[1]))

#define pf_state_counter_ntoh(s,d) do {				\
	d = ntohl(s[0]);					\
	d = d<<32;						\
	d += ntohl(s[1]);					\
} while (0)

TAILQ_HEAD(pf_krulequeue, pf_krule);

struct pf_kanchor;

struct pf_kruleset {
	struct {
		struct pf_krulequeue	 queues[2];
		struct {
			struct pf_krulequeue	*ptr;
			struct pf_krule		**ptr_array;
			u_int32_t		 rcount;
			u_int32_t		 ticket;
			int			 open;
			struct pf_krule_global 	 *tree;
		}			 active, inactive;
	}			 rules[PF_RULESET_MAX];
	struct pf_kanchor	*anchor;
	u_int32_t		 tticket;
	int			 tables;
	int			 topen;
};

RB_HEAD(pf_kanchor_global, pf_kanchor);
RB_HEAD(pf_kanchor_node, pf_kanchor);
struct pf_kanchor {
	RB_ENTRY(pf_kanchor)	 entry_global;
	RB_ENTRY(pf_kanchor)	 entry_node;
	struct pf_kanchor	*parent;
	struct pf_kanchor_node	 children;
	char			 name[PF_ANCHOR_NAME_SIZE];
	char			 path[MAXPATHLEN];
	struct pf_kruleset	 ruleset;
	int			 refcnt;	/* anchor rules */
};
RB_PROTOTYPE(pf_kanchor_global, pf_kanchor, entry_global, pf_anchor_compare);
RB_PROTOTYPE(pf_kanchor_node, pf_kanchor, entry_node, pf_kanchor_compare);

#define PF_RESERVED_ANCHOR	"_pf"

#define PFR_TFLAG_PERSIST	0x00000001
#define PFR_TFLAG_CONST		0x00000002
#define PFR_TFLAG_ACTIVE	0x00000004
#define PFR_TFLAG_INACTIVE	0x00000008
#define PFR_TFLAG_REFERENCED	0x00000010
#define PFR_TFLAG_REFDANCHOR	0x00000020
#define PFR_TFLAG_COUNTERS	0x00000040
/* Adjust masks below when adding flags. */
#define PFR_TFLAG_USRMASK	(PFR_TFLAG_PERSIST	| \
				 PFR_TFLAG_CONST	| \
				 PFR_TFLAG_COUNTERS)
#define PFR_TFLAG_SETMASK	(PFR_TFLAG_ACTIVE	| \
				 PFR_TFLAG_INACTIVE	| \
				 PFR_TFLAG_REFERENCED	| \
				 PFR_TFLAG_REFDANCHOR)
#define PFR_TFLAG_ALLMASK	(PFR_TFLAG_PERSIST	| \
				 PFR_TFLAG_CONST	| \
				 PFR_TFLAG_ACTIVE	| \
				 PFR_TFLAG_INACTIVE	| \
				 PFR_TFLAG_REFERENCED	| \
				 PFR_TFLAG_REFDANCHOR	| \
				 PFR_TFLAG_COUNTERS)

struct pf_kanchor_stackframe;
struct pf_keth_anchor_stackframe;

struct pfr_table {
	char			 pfrt_anchor[MAXPATHLEN];
	char			 pfrt_name[PF_TABLE_NAME_SIZE];
	u_int32_t		 pfrt_flags;
	u_int8_t		 pfrt_fback;
};

enum { PFR_FB_NONE, PFR_FB_MATCH, PFR_FB_ADDED, PFR_FB_DELETED,
	PFR_FB_CHANGED, PFR_FB_CLEARED, PFR_FB_DUPLICATE,
	PFR_FB_NOTMATCH, PFR_FB_CONFLICT, PFR_FB_NOCOUNT, PFR_FB_MAX };

struct pfr_addr {
	union {
		struct in_addr	 _pfra_ip4addr;
		struct in6_addr	 _pfra_ip6addr;
	}		 pfra_u;
	u_int8_t	 pfra_af;
	u_int8_t	 pfra_net;
	u_int8_t	 pfra_not;
	u_int8_t	 pfra_fback;
};
#define	pfra_ip4addr	pfra_u._pfra_ip4addr
#define	pfra_ip6addr	pfra_u._pfra_ip6addr

enum { PFR_DIR_IN, PFR_DIR_OUT, PFR_DIR_MAX };
enum { PFR_OP_BLOCK, PFR_OP_PASS, PFR_OP_ADDR_MAX, PFR_OP_TABLE_MAX };
enum { PFR_TYPE_PACKETS, PFR_TYPE_BYTES, PFR_TYPE_MAX };
#define	PFR_NUM_COUNTERS	(PFR_DIR_MAX * PFR_OP_ADDR_MAX * PFR_TYPE_MAX)
#define PFR_OP_XPASS	PFR_OP_ADDR_MAX

struct pfr_astats {
	struct pfr_addr	 pfras_a;
	u_int64_t	 pfras_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
	u_int64_t	 pfras_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
	long		 pfras_tzero;
};

enum { PFR_REFCNT_RULE, PFR_REFCNT_ANCHOR, PFR_REFCNT_MAX };

struct pfr_tstats {
	struct pfr_table pfrts_t;
	u_int64_t	 pfrts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
	u_int64_t	 pfrts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
	u_int64_t	 pfrts_match;
	u_int64_t	 pfrts_nomatch;
	long		 pfrts_tzero;
	int		 pfrts_cnt;
	int		 pfrts_refcnt[PFR_REFCNT_MAX];
};

#ifdef _KERNEL

struct pfr_kstate_counter {
	counter_u64_t	pkc_pcpu;
	u_int64_t	pkc_zero;
};

static inline int
pfr_kstate_counter_init(struct pfr_kstate_counter *pfrc, int flags)
{

	pfrc->pkc_zero = 0;
	pfrc->pkc_pcpu = counter_u64_alloc(flags);
	if (pfrc->pkc_pcpu == NULL)
		return (ENOMEM);
	return (0);
}

static inline void
pfr_kstate_counter_deinit(struct pfr_kstate_counter *pfrc)
{

	counter_u64_free(pfrc->pkc_pcpu);
}

static inline u_int64_t
pfr_kstate_counter_fetch(struct pfr_kstate_counter *pfrc)
{
	u_int64_t c;

	c = counter_u64_fetch(pfrc->pkc_pcpu);
	c -= pfrc->pkc_zero;
	return (c);
}

static inline void
pfr_kstate_counter_zero(struct pfr_kstate_counter *pfrc)
{
	u_int64_t c;

	c = counter_u64_fetch(pfrc->pkc_pcpu);
	pfrc->pkc_zero = c;
}

static inline void
pfr_kstate_counter_add(struct pfr_kstate_counter *pfrc, int64_t n)
{

	counter_u64_add(pfrc->pkc_pcpu, n);
}
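Note on pfr_kstate_counter above: "zeroing" just records the current counter(9) sum in pkc_zero and fetch() subtracts it, so a clear never has to touch the per-CPU counters. Illustrative sequence (hypothetical helper, not from this header):

static void
example_pfr_counter_zero(struct pfr_kstate_counter *c)
{

	pfr_kstate_counter_add(c, 5);	/* fetch() now returns 5 */
	pfr_kstate_counter_zero(c);	/* offset recorded; fetch() == 0 */
	pfr_kstate_counter_add(c, 2);	/* fetch() == 2 */
}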
struct pfr_ktstats {
	struct pfr_table pfrts_t;
	struct pfr_kstate_counter	 pfrkts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
	struct pfr_kstate_counter	 pfrkts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
	struct pfr_kstate_counter	 pfrkts_match;
	struct pfr_kstate_counter	 pfrkts_nomatch;
	long		 pfrkts_tzero;
	int		 pfrkts_cnt;
	int		 pfrkts_refcnt[PFR_REFCNT_MAX];
};

#endif /* _KERNEL */

#define	pfrts_name	pfrts_t.pfrt_name
#define pfrts_flags	pfrts_t.pfrt_flags

#ifndef _SOCKADDR_UNION_DEFINED
#define	_SOCKADDR_UNION_DEFINED
union sockaddr_union {
	struct sockaddr		sa;
	struct sockaddr_in	sin;
	struct sockaddr_in6	sin6;
};
#endif /* _SOCKADDR_UNION_DEFINED */

struct pfr_kcounters {
	counter_u64_t		 pfrkc_counters;
	long			 pfrkc_tzero;
};
#define	pfr_kentry_counter(kc, dir, op, t)		\
	((kc)->pfrkc_counters +				\
	    (dir) * PFR_OP_ADDR_MAX * PFR_TYPE_MAX + (op) * PFR_TYPE_MAX + (t))

#ifdef _KERNEL
SLIST_HEAD(pfr_kentryworkq, pfr_kentry);
struct pfr_kentry {
	struct radix_node	 pfrke_node[2];
	union sockaddr_union	 pfrke_sa;
	SLIST_ENTRY(pfr_kentry)	 pfrke_workq;
	struct pfr_kcounters	 pfrke_counters;
	u_int8_t		 pfrke_af;
	u_int8_t		 pfrke_net;
	u_int8_t		 pfrke_not;
	u_int8_t		 pfrke_mark;
};

SLIST_HEAD(pfr_ktableworkq, pfr_ktable);
RB_HEAD(pfr_ktablehead, pfr_ktable);
struct pfr_ktable {
	struct pfr_ktstats	 pfrkt_kts;
	RB_ENTRY(pfr_ktable)	 pfrkt_tree;
	SLIST_ENTRY(pfr_ktable)	 pfrkt_workq;
	struct radix_node_head	*pfrkt_ip4;
	struct radix_node_head	*pfrkt_ip6;
	struct pfr_ktable	*pfrkt_shadow;
	struct pfr_ktable	*pfrkt_root;
	struct pf_kruleset	*pfrkt_rs;
	long			 pfrkt_larg;
	int			 pfrkt_nflags;
};
#define pfrkt_t		pfrkt_kts.pfrts_t
#define pfrkt_name	pfrkt_t.pfrt_name
#define pfrkt_anchor	pfrkt_t.pfrt_anchor
#define pfrkt_ruleset	pfrkt_t.pfrt_ruleset
#define pfrkt_flags	pfrkt_t.pfrt_flags
#define pfrkt_cnt	pfrkt_kts.pfrkts_cnt
#define pfrkt_refcnt	pfrkt_kts.pfrkts_refcnt
#define pfrkt_packets	pfrkt_kts.pfrkts_packets
#define pfrkt_bytes	pfrkt_kts.pfrkts_bytes
#define pfrkt_match	pfrkt_kts.pfrkts_match
#define pfrkt_nomatch	pfrkt_kts.pfrkts_nomatch
#define pfrkt_tzero	pfrkt_kts.pfrkts_tzero
#endif

#ifdef _KERNEL
struct pfi_kkif {
	char				 pfik_name[IFNAMSIZ];
	union {
		RB_ENTRY(pfi_kkif)	 _pfik_tree;
		LIST_ENTRY(pfi_kkif)	 _pfik_list;
	} _pfik_glue;
#define	pfik_tree	_pfik_glue._pfik_tree
#define	pfik_list	_pfik_glue._pfik_list
	struct pf_counter_u64		 pfik_packets[2][2][2];
	struct pf_counter_u64		 pfik_bytes[2][2][2];
	u_int32_t			 pfik_tzero;
	u_int				 pfik_flags;
	struct ifnet			*pfik_ifp;
	struct ifg_group		*pfik_group;
	u_int				 pfik_rulerefs;
	TAILQ_HEAD(, pfi_dynaddr)	 pfik_dynaddrs;
#ifdef PF_WANT_32_TO_64_COUNTER
	LIST_ENTRY(pfi_kkif)		 pfik_allkiflist;
#endif
};
#endif

#define	PFI_IFLAG_REFS		0x0001	/* has state references */
#define PFI_IFLAG_SKIP		0x0100	/* skip filtering on interface */

#ifdef _KERNEL
struct pf_pdesc {
	struct {
		int	 done;
		uid_t	 uid;
		gid_t	 gid;
	}		 lookup;
	u_int64_t	 tot_len;	/* Make Mickey money */
	union pf_headers {
		struct tcphdr		tcp;
		struct udphdr		udp;
+		struct sctphdr		sctp;
		struct icmp		icmp;
#ifdef INET6
		struct icmp6_hdr	icmp6;
#endif /* INET6 */
		char any[0];
	} hdr;

	struct pf_krule	*nat_rule;	/* nat/rdr rule applied to packet */
	struct pf_addr	*src;		/* src address */
	struct pf_addr	*dst;		/* dst address */
	u_int16_t	*sport;
	u_int16_t	*dport;
	struct pf_mtag	*pf_mtag;
	struct pf_rule_actions	 act;

	u_int32_t	 p_len;		/* total length of payload */

	u_int16_t	*ip_sum;
	u_int16_t	*proto_sum;
	u_int16_t	 flags;		/* Let SCRUB trigger behavior in
					 * state code. Easier than tags */
#define PFDESC_TCP_NORM	0x0001		/* TCP shall be statefully scrubbed */
#define PFDESC_IP_REAS	0x0002		/* IP frags would've been reassembled */
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 tos;
	u_int8_t	 dir;		/* direction */
	u_int8_t	 sidx;		/* key index for source */
	u_int8_t	 didx;		/* key index for destination */
+#define PFDESC_SCTP_INIT	0x0001
+#define PFDESC_SCTP_INIT_ACK	0x0002
+#define PFDESC_SCTP_COOKIE	0x0004
+#define PFDESC_SCTP_ABORT	0x0008
+#define PFDESC_SCTP_SHUTDOWN	0x0010
+#define PFDESC_SCTP_SHUTDOWN_COMPLETE	0x0020
+#define PFDESC_SCTP_DATA	0x0040
+#define PFDESC_SCTP_OTHER	0x0080
+	u_int16_t	 sctp_flags;
};
#endif

/* flags for RDR options */
#define PF_DPORT_RANGE	0x01		/* Dest port uses range */
#define PF_RPORT_RANGE	0x02		/* RDR'ed port uses range */

/* UDP state enumeration */
#define PFUDPS_NO_TRAFFIC	0
#define PFUDPS_SINGLE		1
#define PFUDPS_MULTIPLE		2

#define PFUDPS_NSTATES		3	/* number of state levels */

#define PFUDPS_NAMES { \
	"NO_TRAFFIC", \
	"SINGLE", \
	"MULTIPLE", \
	NULL \
}

/* Other protocol state enumeration */
#define PFOTHERS_NO_TRAFFIC	0
#define PFOTHERS_SINGLE		1
#define PFOTHERS_MULTIPLE	2

#define PFOTHERS_NSTATES	3	/* number of state levels */

#define PFOTHERS_NAMES { \
	"NO_TRAFFIC", \
	"SINGLE", \
	"MULTIPLE", \
	NULL \
}

#define ACTION_SET(a, x) \
	do { \
		if ((a) != NULL) \
			*(a) = (x); \
	} while (0)

#define REASON_SET(a, x) \
	do { \
		if ((a) != NULL) \
			*(a) = (x); \
		if (x < PFRES_MAX) \
			counter_u64_add(V_pf_status.counters[x], 1); \
	} while (0)

enum pf_syncookies_mode {
	PF_SYNCOOKIES_NEVER = 0,
	PF_SYNCOOKIES_ALWAYS = 1,
	PF_SYNCOOKIES_ADAPTIVE = 2,
	PF_SYNCOOKIES_MODE_MAX = PF_SYNCOOKIES_ADAPTIVE
};

#define	PF_SYNCOOKIES_HIWATPCT	25
#define	PF_SYNCOOKIES_LOWATPCT	(PF_SYNCOOKIES_HIWATPCT / 2)

#ifdef _KERNEL
struct pf_kstatus {
	counter_u64_t	counters[PFRES_MAX]; /* reason for passing/dropping */
	counter_u64_t	lcounters[KLCNT_MAX]; /* limit counters */
	struct pf_counter_u64	fcounters[FCNT_MAX]; /* state operation counters */
	counter_u64_t	scounters[SCNT_MAX]; /* src_node operation counters */
	uint32_t	states;
	uint32_t	src_nodes;
	uint32_t	running;
	uint32_t	since;
	uint32_t	debug;
	uint32_t	hostid;
	char		ifname[IFNAMSIZ];
	uint8_t		pf_chksum[PF_MD5_DIGEST_LENGTH];
	bool		keep_counters;
	enum pf_syncookies_mode	syncookies_mode;
	bool		syncookies_active;
	uint64_t	syncookies_inflight[2];
	uint32_t	states_halfopen;
	uint32_t	reass;
};
#endif

struct pf_divert {
	union {
		struct in_addr	ipv4;
		struct in6_addr	ipv6;
	}		addr;
	u_int16_t	port;
};

#define PFFRAG_FRENT_HIWAT	5000	/* Number of fragment entries */
#define PFR_KENTRY_HIWAT	200000	/* Number of table entries */

/*
 * Limit the length of the fragment queue traversal.  Remember
 * search entry points based on the fragment offset.
 */
#define PF_FRAG_ENTRY_POINTS		16

/*
 * The number of entries in the fragment queue must be limited
 * to avoid DoS by linear searching.  Instead of a global limit,
 * use a limit per entry point.  For large packets these sum up.
 */
#define PF_FRAG_ENTRY_LIMIT		64
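Back to the pf_pdesc additions above: pd->sctp_flags is a bitmask accumulating one PFDESC_SCTP_* bit per chunk kind seen in the packet (an SCTP packet can bundle several chunks). The chunk-scanning code is not part of this excerpt; an illustrative sketch of how a scanner would set the bits, using the chunk type constants from <netinet/sctp.h> (example_mark_sctp_chunk is hypothetical):

static void
example_mark_sctp_chunk(struct pf_pdesc *pd, uint8_t chunk_type)
{

	switch (chunk_type) {
	case SCTP_INITIATION:
		pd->sctp_flags |= PFDESC_SCTP_INIT;
		break;
	case SCTP_INITIATION_ACK:
		pd->sctp_flags |= PFDESC_SCTP_INIT_ACK;
		break;
	case SCTP_COOKIE_ECHO:
		pd->sctp_flags |= PFDESC_SCTP_COOKIE;
		break;
	case SCTP_ABORT_ASSOCIATION:
		pd->sctp_flags |= PFDESC_SCTP_ABORT;
		break;
	case SCTP_SHUTDOWN:
		pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN;
		break;
	case SCTP_SHUTDOWN_COMPLETE:
		pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN_COMPLETE;
		break;
	case SCTP_DATA:
		pd->sctp_flags |= PFDESC_SCTP_DATA;
		break;
	default:
		pd->sctp_flags |= PFDESC_SCTP_OTHER;
		break;
	}
}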
/*
 * ioctl parameter structures
 */

struct pfioc_pooladdr {
	u_int32_t		 action;
	u_int32_t		 ticket;
	u_int32_t		 nr;
	u_int32_t		 r_num;
	u_int8_t		 r_action;
	u_int8_t		 r_last;
	u_int8_t		 af;
	char			 anchor[MAXPATHLEN];
	struct pf_pooladdr	 addr;
};

struct pfioc_rule {
	u_int32_t	 action;
	u_int32_t	 ticket;
	u_int32_t	 pool_ticket;
	u_int32_t	 nr;
	char		 anchor[MAXPATHLEN];
	char		 anchor_call[MAXPATHLEN];
	struct pf_rule	 rule;
};

struct pfioc_natlook {
	struct pf_addr	 saddr;
	struct pf_addr	 daddr;
	struct pf_addr	 rsaddr;
	struct pf_addr	 rdaddr;
	u_int16_t	 sport;
	u_int16_t	 dport;
	u_int16_t	 rsport;
	u_int16_t	 rdport;
	sa_family_t	 af;
	u_int8_t	 proto;
	u_int8_t	 direction;
};

struct pfioc_state {
	struct pfsync_state_1301	state;
};

struct pfioc_src_node_kill {
	sa_family_t psnk_af;
	struct pf_rule_addr psnk_src;
	struct pf_rule_addr psnk_dst;
	u_int	    psnk_killed;
};

#ifdef _KERNEL
struct pf_kstate_kill {
	struct pf_state_cmp	psk_pfcmp;
	sa_family_t		psk_af;
	int			psk_proto;
	struct pf_rule_addr	psk_src;
	struct pf_rule_addr	psk_dst;
	struct pf_rule_addr	psk_rt_addr;
	char			psk_ifname[IFNAMSIZ];
	char			psk_label[PF_RULE_LABEL_SIZE];
	u_int			psk_killed;
	bool			psk_kill_match;
};
#endif

struct pfioc_state_kill {
	struct pf_state_cmp	psk_pfcmp;
	sa_family_t		psk_af;
	int			psk_proto;
	struct pf_rule_addr	psk_src;
	struct pf_rule_addr	psk_dst;
	char			psk_ifname[IFNAMSIZ];
	char			psk_label[PF_RULE_LABEL_SIZE];
	u_int			psk_killed;
};

struct pfioc_states {
	int	ps_len;
	union {
		void				*ps_buf;
		struct pfsync_state_1301	*ps_states;
	};
};

struct pfioc_states_v2 {
	int		ps_len;
	uint64_t	ps_req_version;
	union {
		void			*ps_buf;
		struct pf_state_export	*ps_states;
	};
};

struct pfioc_src_nodes {
	int	psn_len;
	union {
		void		*psn_buf;
		struct pf_src_node	*psn_src_nodes;
	};
};

struct pfioc_if {
	char		 ifname[IFNAMSIZ];
};

struct pfioc_tm {
	int		 timeout;
	int		 seconds;
};

struct pfioc_limit {
	int		 index;
	unsigned	 limit;
};

struct pfioc_altq_v0 {
	u_int32_t	 action;
	u_int32_t	 ticket;
	u_int32_t	 nr;
	struct pf_altq_v0 altq;
};

struct pfioc_altq_v1 {
	u_int32_t	 action;
	u_int32_t	 ticket;
	u_int32_t	 nr;
	/*
	 * Placed here so code that only uses the above parameters can be
	 * written entirely in terms of the v0 or v1 type.
	 */
	u_int32_t	 version;
	struct pf_altq_v1 altq;
};

/*
 * Latest version of struct pfioc_altq_vX.  This must move in lock-step with
 * the latest version of struct pf_altq_vX as it has that struct as a
 * member.
 */
#define PFIOC_ALTQ_VERSION	PF_ALTQ_VERSION

struct pfioc_qstats_v0 {
	u_int32_t	 ticket;
	u_int32_t	 nr;
	void		*buf;
	int		 nbytes;
	u_int8_t	 scheduler;
};

struct pfioc_qstats_v1 {
	u_int32_t	 ticket;
	u_int32_t	 nr;
	void		*buf;
	int		 nbytes;
	u_int8_t	 scheduler;
	/*
	 * Placed here so code that only uses the above parameters can be
	 * written entirely in terms of the v0 or v1 type.
	 */
	u_int32_t	 version;  /* Requested version of stats struct */
};

/* Latest version of struct pfioc_qstats_vX */
#define PFIOC_QSTATS_VERSION	1

struct pfioc_ruleset {
	u_int32_t	 nr;
	char		 path[MAXPATHLEN];
	char		 name[PF_ANCHOR_NAME_SIZE];
};

#define PF_RULESET_ALTQ		(PF_RULESET_MAX)
#define PF_RULESET_TABLE	(PF_RULESET_MAX+1)
#define PF_RULESET_ETH		(PF_RULESET_MAX+2)
struct pfioc_trans {
	int		 size;	/* number of elements */
	int		 esize; /* size of each element in bytes */
	struct pfioc_trans_e {
		int		rs_num;
		char		anchor[MAXPATHLEN];
		u_int32_t	ticket;
	}		*array;
};

#define PFR_FLAG_ATOMIC		0x00000001	/* unused */
#define PFR_FLAG_DUMMY		0x00000002
#define PFR_FLAG_FEEDBACK	0x00000004
#define PFR_FLAG_CLSTATS	0x00000008
#define PFR_FLAG_ADDRSTOO	0x00000010
#define PFR_FLAG_REPLACE	0x00000020
#define PFR_FLAG_ALLRSETS	0x00000040
#define PFR_FLAG_ALLMASK	0x0000007F
#ifdef _KERNEL
#define PFR_FLAG_USERIOCTL	0x10000000
#endif

struct pfioc_table {
	struct pfr_table	 pfrio_table;
	void			*pfrio_buffer;
	int			 pfrio_esize;
	int			 pfrio_size;
	int			 pfrio_size2;
	int			 pfrio_nadd;
	int			 pfrio_ndel;
	int			 pfrio_nchange;
	int			 pfrio_flags;
	u_int32_t		 pfrio_ticket;
};
#define	pfrio_exists	pfrio_nadd
#define	pfrio_nzero	pfrio_nadd
#define	pfrio_nmatch	pfrio_nadd
#define pfrio_naddr	pfrio_size2
#define pfrio_setflag	pfrio_size2
#define pfrio_clrflag	pfrio_nadd

struct pfioc_iface {
	char	 pfiio_name[IFNAMSIZ];
	void	*pfiio_buffer;
	int	 pfiio_esize;
	int	 pfiio_size;
	int	 pfiio_nzero;
	int	 pfiio_flags;
};

/*
 * ioctl operations
 */

#define DIOCSTART	_IO  ('D',  1)
#define DIOCSTOP	_IO  ('D',  2)
#define DIOCADDRULE	_IOWR('D',  4, struct pfioc_rule)
#define DIOCADDRULENV	_IOWR('D',  4, struct pfioc_nv)
#define DIOCGETRULES	_IOWR('D',  6, struct pfioc_rule)
#define DIOCGETRULE	_IOWR('D',  7, struct pfioc_rule)
#define DIOCGETRULENV	_IOWR('D',  7, struct pfioc_nv)
/* XXX cut 8 - 17 */
#define DIOCCLRSTATES	_IOWR('D', 18, struct pfioc_state_kill)
#define DIOCCLRSTATESNV	_IOWR('D', 18, struct pfioc_nv)
#define DIOCGETSTATE	_IOWR('D', 19, struct pfioc_state)
#define DIOCGETSTATENV	_IOWR('D', 19, struct pfioc_nv)
#define DIOCSETSTATUSIF	_IOWR('D', 20, struct pfioc_if)
#define DIOCGETSTATUS	_IOWR('D', 21, struct pf_status)
#define DIOCGETSTATUSNV	_IOWR('D', 21, struct pfioc_nv)
#define DIOCCLRSTATUS	_IO  ('D', 22)
#define DIOCNATLOOK	_IOWR('D', 23, struct pfioc_natlook)
#define DIOCSETDEBUG	_IOWR('D', 24, u_int32_t)
#define DIOCGETSTATES	_IOWR('D', 25, struct pfioc_states)
#define DIOCCHANGERULE	_IOWR('D', 26, struct pfioc_rule)
/* XXX cut 26 - 28 */
#define DIOCSETTIMEOUT	_IOWR('D', 29, struct pfioc_tm)
#define DIOCGETTIMEOUT	_IOWR('D', 30, struct pfioc_tm)
#define DIOCADDSTATE	_IOWR('D', 37, struct pfioc_state)
#define DIOCCLRRULECTRS	_IO  ('D', 38)
#define DIOCGETLIMIT	_IOWR('D', 39, struct pfioc_limit)
#define DIOCSETLIMIT	_IOWR('D', 40, struct pfioc_limit)
#define DIOCKILLSTATES	_IOWR('D', 41, struct pfioc_state_kill)
#define DIOCKILLSTATESNV	_IOWR('D', 41, struct pfioc_nv)
#define DIOCSTARTALTQ	_IO  ('D', 42)
#define DIOCSTOPALTQ	_IO  ('D', 43)
#define DIOCADDALTQV0	_IOWR('D', 45, struct pfioc_altq_v0)
#define DIOCADDALTQV1	_IOWR('D', 45, struct pfioc_altq_v1)
#define DIOCGETALTQSV0	_IOWR('D', 47, struct pfioc_altq_v0)
#define DIOCGETALTQSV1	_IOWR('D', 47, struct pfioc_altq_v1)
#define DIOCGETALTQV0	_IOWR('D', 48, struct pfioc_altq_v0)
#define DIOCGETALTQV1	_IOWR('D', 48, struct pfioc_altq_v1)
#define DIOCCHANGEALTQV0 _IOWR('D', 49, struct pfioc_altq_v0)
#define DIOCCHANGEALTQV1 _IOWR('D', 49, struct pfioc_altq_v1)
#define DIOCGETQSTATSV0	_IOWR('D', 50, struct pfioc_qstats_v0)
pfioc_qstats_v0)
#define	DIOCGETQSTATSV1	_IOWR('D', 50, struct pfioc_qstats_v1)
#define	DIOCBEGINADDRS	_IOWR('D', 51, struct pfioc_pooladdr)
#define	DIOCADDADDR	_IOWR('D', 52, struct pfioc_pooladdr)
#define	DIOCGETADDRS	_IOWR('D', 53, struct pfioc_pooladdr)
#define	DIOCGETADDR	_IOWR('D', 54, struct pfioc_pooladdr)
#define	DIOCCHANGEADDR	_IOWR('D', 55, struct pfioc_pooladdr)
/* XXX cut 55 - 57 */
#define	DIOCGETRULESETS	_IOWR('D', 58, struct pfioc_ruleset)
#define	DIOCGETRULESET	_IOWR('D', 59, struct pfioc_ruleset)
#define	DIOCRCLRTABLES	_IOWR('D', 60, struct pfioc_table)
#define	DIOCRADDTABLES	_IOWR('D', 61, struct pfioc_table)
#define	DIOCRDELTABLES	_IOWR('D', 62, struct pfioc_table)
#define	DIOCRGETTABLES	_IOWR('D', 63, struct pfioc_table)
#define	DIOCRGETTSTATS	_IOWR('D', 64, struct pfioc_table)
#define	DIOCRCLRTSTATS	_IOWR('D', 65, struct pfioc_table)
#define	DIOCRCLRADDRS	_IOWR('D', 66, struct pfioc_table)
#define	DIOCRADDADDRS	_IOWR('D', 67, struct pfioc_table)
#define	DIOCRDELADDRS	_IOWR('D', 68, struct pfioc_table)
#define	DIOCRSETADDRS	_IOWR('D', 69, struct pfioc_table)
#define	DIOCRGETADDRS	_IOWR('D', 70, struct pfioc_table)
#define	DIOCRGETASTATS	_IOWR('D', 71, struct pfioc_table)
#define	DIOCRCLRASTATS	_IOWR('D', 72, struct pfioc_table)
#define	DIOCRTSTADDRS	_IOWR('D', 73, struct pfioc_table)
#define	DIOCRSETTFLAGS	_IOWR('D', 74, struct pfioc_table)
#define	DIOCRINADEFINE	_IOWR('D', 77, struct pfioc_table)
#define	DIOCOSFPFLUSH	_IO('D', 78)
#define	DIOCOSFPADD	_IOWR('D', 79, struct pf_osfp_ioctl)
#define	DIOCOSFPGET	_IOWR('D', 80, struct pf_osfp_ioctl)
#define	DIOCXBEGIN	_IOWR('D', 81, struct pfioc_trans)
#define	DIOCXCOMMIT	_IOWR('D', 82, struct pfioc_trans)
#define	DIOCXROLLBACK	_IOWR('D', 83, struct pfioc_trans)
#define	DIOCGETSRCNODES	_IOWR('D', 84, struct pfioc_src_nodes)
#define	DIOCCLRSRCNODES	_IO('D', 85)
#define	DIOCSETHOSTID	_IOWR('D', 86, u_int32_t)
#define	DIOCIGETIFACES	_IOWR('D', 87, struct pfioc_iface)
#define	DIOCSETIFFLAG	_IOWR('D', 89, struct pfioc_iface)
#define	DIOCCLRIFFLAG	_IOWR('D', 90, struct pfioc_iface)
#define	DIOCKILLSRCNODES	_IOWR('D', 91, struct pfioc_src_node_kill)
#define	DIOCGIFSPEEDV0	_IOWR('D', 92, struct pf_ifspeed_v0)
#define	DIOCGIFSPEEDV1	_IOWR('D', 92, struct pf_ifspeed_v1)
#define	DIOCGETSTATESV2	_IOWR('D', 93, struct pfioc_states_v2)
#define	DIOCGETSYNCOOKIES	_IOWR('D', 94, struct pfioc_nv)
#define	DIOCSETSYNCOOKIES	_IOWR('D', 95, struct pfioc_nv)
#define	DIOCKEEPCOUNTERS	_IOWR('D', 96, struct pfioc_nv)
#define	DIOCKEEPCOUNTERS_FREEBSD13	_IOWR('D', 92, struct pfioc_nv)
#define	DIOCADDETHRULE	_IOWR('D', 97, struct pfioc_nv)
#define	DIOCGETETHRULE	_IOWR('D', 98, struct pfioc_nv)
#define	DIOCGETETHRULES	_IOWR('D', 99, struct pfioc_nv)
#define	DIOCGETETHRULESETS	_IOWR('D', 100, struct pfioc_nv)
#define	DIOCGETETHRULESET	_IOWR('D', 101, struct pfioc_nv)
#define	DIOCSETREASS	_IOWR('D', 102, u_int32_t)

struct pf_ifspeed_v0 {
	char			ifname[IFNAMSIZ];
	u_int32_t		baudrate;
};

struct pf_ifspeed_v1 {
	char			ifname[IFNAMSIZ];
	u_int32_t		baudrate32;
	/* layout identical to struct pf_ifspeed_v0 up to this point */
	u_int64_t		baudrate;
};

/* Latest version of struct pf_ifspeed_vX */
#define	PF_IFSPEED_VERSION	1

/*
 * Compatibility and convenience macros
 */
#ifndef _KERNEL
#ifdef PFIOC_USE_LATEST
/*
 * Maintaining in-tree consumers of the ioctl interface is easier when that
 * code can be written in terms of old names that refer to the latest
 * interface version, as that reduces the required changes in the consumers
 * to those that are functionally necessary to 
accommodate a new interface version. */ #define pfioc_altq __CONCAT(pfioc_altq_v, PFIOC_ALTQ_VERSION) #define pfioc_qstats __CONCAT(pfioc_qstats_v, PFIOC_QSTATS_VERSION) #define pf_ifspeed __CONCAT(pf_ifspeed_v, PF_IFSPEED_VERSION) #define DIOCADDALTQ __CONCAT(DIOCADDALTQV, PFIOC_ALTQ_VERSION) #define DIOCGETALTQS __CONCAT(DIOCGETALTQSV, PFIOC_ALTQ_VERSION) #define DIOCGETALTQ __CONCAT(DIOCGETALTQV, PFIOC_ALTQ_VERSION) #define DIOCCHANGEALTQ __CONCAT(DIOCCHANGEALTQV, PFIOC_ALTQ_VERSION) #define DIOCGETQSTATS __CONCAT(DIOCGETQSTATSV, PFIOC_QSTATS_VERSION) #define DIOCGIFSPEED __CONCAT(DIOCGIFSPEEDV, PF_IFSPEED_VERSION) #else /* * When building out-of-tree code that is written for the old interface, * such as may exist in ports for example, resolve the old struct tags and * ioctl command names to the v0 versions. */ #define pfioc_altq __CONCAT(pfioc_altq_v, 0) #define pfioc_qstats __CONCAT(pfioc_qstats_v, 0) #define pf_ifspeed __CONCAT(pf_ifspeed_v, 0) #define DIOCADDALTQ __CONCAT(DIOCADDALTQV, 0) #define DIOCGETALTQS __CONCAT(DIOCGETALTQSV, 0) #define DIOCGETALTQ __CONCAT(DIOCGETALTQV, 0) #define DIOCCHANGEALTQ __CONCAT(DIOCCHANGEALTQV, 0) #define DIOCGETQSTATS __CONCAT(DIOCGETQSTATSV, 0) #define DIOCGIFSPEED __CONCAT(DIOCGIFSPEEDV, 0) #endif /* PFIOC_USE_LATEST */ #endif /* _KERNEL */ #ifdef _KERNEL LIST_HEAD(pf_ksrc_node_list, pf_ksrc_node); struct pf_srchash { struct pf_ksrc_node_list nodes; struct mtx lock; }; struct pf_keyhash { LIST_HEAD(, pf_state_key) keys; struct mtx lock; }; struct pf_idhash { LIST_HEAD(, pf_kstate) states; struct mtx lock; }; extern u_long pf_ioctl_maxcount; extern u_long pf_hashmask; extern u_long pf_srchashmask; #define PF_HASHSIZ (131072) #define PF_SRCHASHSIZ (PF_HASHSIZ/4) VNET_DECLARE(struct pf_keyhash *, pf_keyhash); VNET_DECLARE(struct pf_idhash *, pf_idhash); #define V_pf_keyhash VNET(pf_keyhash) #define V_pf_idhash VNET(pf_idhash) VNET_DECLARE(struct pf_srchash *, pf_srchash); #define V_pf_srchash VNET(pf_srchash) #define PF_IDHASH(s) (be64toh((s)->id) % (pf_hashmask + 1)) VNET_DECLARE(void *, pf_swi_cookie); #define V_pf_swi_cookie VNET(pf_swi_cookie) VNET_DECLARE(struct intr_event *, pf_swi_ie); #define V_pf_swi_ie VNET(pf_swi_ie) VNET_DECLARE(struct unrhdr64, pf_stateid); #define V_pf_stateid VNET(pf_stateid) TAILQ_HEAD(pf_altqqueue, pf_altq); VNET_DECLARE(struct pf_altqqueue, pf_altqs[4]); #define V_pf_altqs VNET(pf_altqs) VNET_DECLARE(struct pf_kpalist, pf_pabuf); #define V_pf_pabuf VNET(pf_pabuf) VNET_DECLARE(u_int32_t, ticket_altqs_active); #define V_ticket_altqs_active VNET(ticket_altqs_active) VNET_DECLARE(u_int32_t, ticket_altqs_inactive); #define V_ticket_altqs_inactive VNET(ticket_altqs_inactive) VNET_DECLARE(int, altqs_inactive_open); #define V_altqs_inactive_open VNET(altqs_inactive_open) VNET_DECLARE(u_int32_t, ticket_pabuf); #define V_ticket_pabuf VNET(ticket_pabuf) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_active); #define V_pf_altqs_active VNET(pf_altqs_active) VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_active); #define V_pf_altq_ifs_active VNET(pf_altq_ifs_active) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_inactive); #define V_pf_altqs_inactive VNET(pf_altqs_inactive) VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_inactive); #define V_pf_altq_ifs_inactive VNET(pf_altq_ifs_inactive) VNET_DECLARE(struct pf_krulequeue, pf_unlinked_rules); #define V_pf_unlinked_rules VNET(pf_unlinked_rules) #ifdef PF_WANT_32_TO_64_COUNTER LIST_HEAD(allkiflist_head, pfi_kkif); VNET_DECLARE(struct allkiflist_head, pf_allkiflist); #define 
V_pf_allkiflist	VNET(pf_allkiflist)
VNET_DECLARE(size_t, pf_allkifcount);
#define	V_pf_allkifcount	VNET(pf_allkifcount)
VNET_DECLARE(struct pfi_kkif *, pf_kifmarker);
#define	V_pf_kifmarker	VNET(pf_kifmarker)

LIST_HEAD(allrulelist_head, pf_krule);
VNET_DECLARE(struct allrulelist_head, pf_allrulelist);
#define	V_pf_allrulelist	VNET(pf_allrulelist)
VNET_DECLARE(size_t, pf_allrulecount);
#define	V_pf_allrulecount	VNET(pf_allrulecount)
VNET_DECLARE(struct pf_krule *, pf_rulemarker);
#define	V_pf_rulemarker	VNET(pf_rulemarker)
#endif

void	pf_initialize(void);
void	pf_mtag_initialize(void);
void	pf_mtag_cleanup(void);
void	pf_cleanup(void);

struct pf_mtag	*pf_get_mtag(struct mbuf *);

extern void	pf_calc_skip_steps(struct pf_krulequeue *);
#ifdef ALTQ
extern void	pf_altq_ifnet_event(struct ifnet *, int);
#endif

VNET_DECLARE(uma_zone_t, pf_state_z);
#define	V_pf_state_z	VNET(pf_state_z)
VNET_DECLARE(uma_zone_t, pf_state_key_z);
#define	V_pf_state_key_z	VNET(pf_state_key_z)
VNET_DECLARE(uma_zone_t, pf_state_scrub_z);
#define	V_pf_state_scrub_z	VNET(pf_state_scrub_z)

extern void	pf_purge_thread(void *);
extern void	pf_unload_vnet_purge(void);
extern void	pf_intr(void *);
extern void	pf_purge_expired_src_nodes(void);
extern int	pf_unlink_state(struct pf_kstate *);
extern int	pf_state_insert(struct pfi_kkif *, struct pfi_kkif *,
	    struct pf_state_key *, struct pf_state_key *, struct pf_kstate *);
extern struct pf_kstate	*pf_alloc_state(int);
extern void	pf_free_state(struct pf_kstate *);

static __inline void
pf_ref_state(struct pf_kstate *s)
{
	refcount_acquire(&s->refs);
}

static __inline int
pf_release_state(struct pf_kstate *s)
{
	if (refcount_release(&s->refs)) {
		pf_free_state(s);
		return (1);
	} else
		return (0);
}

static __inline int
pf_release_staten(struct pf_kstate *s, u_int n)
{
	if (refcount_releasen(&s->refs, n)) {
		pf_free_state(s);
		return (1);
	} else
		return (0);
}

extern struct pf_kstate	*pf_find_state_byid(uint64_t, uint32_t);
extern struct pf_kstate	*pf_find_state_all(struct pf_state_key_cmp *,
	    u_int, int *);
extern bool	pf_find_state_all_exists(struct pf_state_key_cmp *, u_int);
extern struct pf_ksrc_node	*pf_find_src_node(struct pf_addr *,
	    struct pf_krule *, sa_family_t, struct pf_srchash **, bool);
extern void	pf_unlink_src_node(struct pf_ksrc_node *);
extern u_int	pf_free_src_nodes(struct pf_ksrc_node_list *);
extern void	pf_print_state(struct pf_kstate *);
extern void	pf_print_flags(u_int8_t);
extern int	pf_addr_wrap_neq(struct pf_addr_wrap *,
	    struct pf_addr_wrap *);
extern u_int16_t	pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t,
	    u_int8_t);
extern u_int16_t	pf_proto_cksum_fixup(struct mbuf *, u_int16_t,
	    u_int16_t, u_int16_t, u_int8_t);

VNET_DECLARE(struct ifnet *, sync_ifp);
#define	V_sync_ifp	VNET(sync_ifp)
VNET_DECLARE(struct pf_krule, pf_default_rule);
#define	V_pf_default_rule	VNET(pf_default_rule)

extern void	pf_addrcpy(struct pf_addr *, struct pf_addr *, sa_family_t);
void	pf_free_rule(struct pf_krule *);

int	pf_test_eth(int, int, struct ifnet *, struct mbuf **, struct inpcb *);
#ifdef INET
int	pf_test(int, int, struct ifnet *, struct mbuf **, struct inpcb *,
	    struct pf_rule_actions *);
int	pf_normalize_ip(struct mbuf **, struct pfi_kkif *, u_short *,
	    struct pf_pdesc *);
#endif /* INET */

#ifdef INET6
int	pf_test6(int, int, struct ifnet *, struct mbuf **, struct inpcb *,
	    struct pf_rule_actions *);
int	pf_normalize_ip6(struct mbuf **, struct pfi_kkif *, u_short *,
	    struct pf_pdesc *);
void	pf_poolmask(struct pf_addr *, struct pf_addr *, struct pf_addr *,
	    struct pf_addr *, sa_family_t);
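/*
 * Illustrative sketch, added for exposition and not part of the original
 * header: the state lookup and refcount primitives declared above are
 * meant to be used together.  pf_find_state_byid() returns with the ID
 * hash row locked on success, so a caller that wants to keep the state
 * beyond that lock scope takes its own reference before unlocking.
 * Hypothetical usage, assuming caller-supplied `id' and `creatorid':
 */
#if 0
	struct pf_kstate *st;

	st = pf_find_state_byid(id, creatorid);	/* row-locked on success */
	if (st != NULL) {
		pf_ref_state(st);	/* hold our own reference */
		PF_STATE_UNLOCK(st);	/* drop the hash row lock */
		/* ... inspect st without holding the row lock ... */
		pf_release_state(st);	/* frees the state on last release */
	}
#endif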
void pf_addr_inc(struct pf_addr *, sa_family_t); int pf_refragment6(struct ifnet *, struct mbuf **, struct m_tag *, bool); #endif /* INET6 */ u_int32_t pf_new_isn(struct pf_kstate *); void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, sa_family_t); void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t); void pf_change_proto_a(struct mbuf *, void *, u_int16_t *, u_int32_t, u_int8_t); void pf_change_tcp_a(struct mbuf *, void *, u_int16_t *, u_int32_t); void pf_patch_16_unaligned(struct mbuf *, u_int16_t *, void *, u_int16_t, bool, u_int8_t); void pf_patch_32_unaligned(struct mbuf *, u_int16_t *, void *, u_int32_t, bool, u_int8_t); void pf_send_deferred_syn(struct pf_kstate *); int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); int pf_match_addr_range(struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); void pf_normalize_init(void); void pf_normalize_cleanup(void); int pf_normalize_tcp(struct pfi_kkif *, struct mbuf *, int, int, void *, struct pf_pdesc *); void pf_normalize_tcp_cleanup(struct pf_kstate *); int pf_normalize_tcp_init(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *, struct pf_state_peer *); int pf_normalize_tcp_stateful(struct mbuf *, int, struct pf_pdesc *, u_short *, struct tcphdr *, struct pf_kstate *, struct pf_state_peer *, struct pf_state_peer *, int *); +int pf_normalize_sctp(int, struct pfi_kkif *, struct mbuf *, int, + int, void *, struct pf_pdesc *); u_int32_t pf_state_expires(const struct pf_kstate *); void pf_purge_expired_fragments(void); void pf_purge_fragments(uint32_t); int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *, int); int pf_socket_lookup(struct pf_pdesc *, struct mbuf *); struct pf_state_key *pf_alloc_state_key(int); void pfr_initialize(void); void pfr_cleanup(void); int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, u_int64_t, int, int, int); int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t); void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); struct pfr_ktable * pfr_attach_table(struct pf_kruleset *, char *); struct pfr_ktable * pfr_eth_attach_table(struct pf_keth_ruleset *, char *); void pfr_detach_table(struct pfr_ktable *); int pfr_clr_tables(struct pfr_table *, int *, int); int pfr_add_tables(struct pfr_table *, int, int *, int); int pfr_del_tables(struct pfr_table *, int, int *, int); int pfr_table_count(struct pfr_table *, int); int pfr_get_tables(struct pfr_table *, struct pfr_table *, int *, int); int pfr_get_tstats(struct pfr_table *, struct pfr_tstats *, int *, int); int pfr_clr_tstats(struct pfr_table *, int, int *, int); int pfr_set_tflags(struct pfr_table *, int, int, int, int *, int *, int); int pfr_clr_addrs(struct pfr_table *, int *, int); int pfr_insert_kentry(struct pfr_ktable *, struct pfr_addr *, long); int pfr_add_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_del_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_set_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int *, int *, int *, int, u_int32_t); int pfr_get_addrs(struct pfr_table *, struct pfr_addr *, int *, int); int pfr_get_astats(struct pfr_table *, struct pfr_astats *, int *, int); int pfr_clr_astats(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_tst_addrs(struct 
pfr_table *, struct pfr_addr *, int, int *, int); int pfr_ina_begin(struct pfr_table *, u_int32_t *, int *, int); int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, int *, u_int32_t, int); MALLOC_DECLARE(PFI_MTYPE); VNET_DECLARE(struct pfi_kkif *, pfi_all); #define V_pfi_all VNET(pfi_all) void pfi_initialize(void); void pfi_initialize_vnet(void); void pfi_cleanup(void); void pfi_cleanup_vnet(void); void pfi_kkif_ref(struct pfi_kkif *); void pfi_kkif_unref(struct pfi_kkif *); struct pfi_kkif *pfi_kkif_find(const char *); struct pfi_kkif *pfi_kkif_attach(struct pfi_kkif *, const char *); int pfi_kkif_match(struct pfi_kkif *, struct pfi_kkif *); void pfi_kkif_purge(void); int pfi_match_addr(struct pfi_dynaddr *, struct pf_addr *, sa_family_t); int pfi_dynaddr_setup(struct pf_addr_wrap *, sa_family_t); void pfi_dynaddr_remove(struct pfi_dynaddr *); void pfi_dynaddr_copyout(struct pf_addr_wrap *); void pfi_update_status(const char *, struct pf_status *); void pfi_get_ifaces(const char *, struct pfi_kif *, int *); int pfi_set_flags(const char *, int); int pfi_clear_flags(const char *, int); int pf_match_tag(struct mbuf *, struct pf_krule *, int *, int); int pf_tag_packet(struct mbuf *, struct pf_pdesc *, int); int pf_addr_cmp(struct pf_addr *, struct pf_addr *, sa_family_t); u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, sa_family_t); u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, sa_family_t); struct mbuf *pf_build_tcp(const struct pf_krule *, sa_family_t, const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, bool, u_int16_t, u_int16_t, int); void pf_send_tcp(const struct pf_krule *, sa_family_t, const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, bool, u_int16_t, u_int16_t, int); void pf_syncookies_init(void); void pf_syncookies_cleanup(void); int pf_get_syncookies(struct pfioc_nv *); int pf_set_syncookies(struct pfioc_nv *); int pf_synflood_check(struct pf_pdesc *); void pf_syncookie_send(struct mbuf *m, int off, struct pf_pdesc *); bool pf_syncookie_check(struct pf_pdesc *); u_int8_t pf_syncookie_validate(struct pf_pdesc *); struct mbuf * pf_syncookie_recreate_syn(uint8_t, int, struct pf_pdesc *); VNET_DECLARE(struct pf_kstatus, pf_status); #define V_pf_status VNET(pf_status) struct pf_limit { uma_zone_t zone; u_int limit; }; VNET_DECLARE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define V_pf_limits VNET(pf_limits) #endif /* _KERNEL */ #ifdef _KERNEL VNET_DECLARE(struct pf_kanchor_global, pf_anchors); #define V_pf_anchors VNET(pf_anchors) VNET_DECLARE(struct pf_kanchor, pf_main_anchor); #define V_pf_main_anchor VNET(pf_main_anchor) VNET_DECLARE(struct pf_keth_anchor_global, pf_keth_anchors); #define V_pf_keth_anchors VNET(pf_keth_anchors) #define pf_main_ruleset V_pf_main_anchor.ruleset VNET_DECLARE(struct pf_keth_anchor, pf_main_keth_anchor); #define V_pf_main_keth_anchor VNET(pf_main_keth_anchor) VNET_DECLARE(struct pf_keth_ruleset*, pf_keth); #define V_pf_keth VNET(pf_keth) void pf_init_kruleset(struct pf_kruleset *); void pf_init_keth(struct pf_keth_ruleset *); int pf_kanchor_setup(struct pf_krule *, const struct pf_kruleset *, const char *); int pf_kanchor_nvcopyout(const struct pf_kruleset *, const struct pf_krule *, nvlist_t *); int pf_kanchor_copyout(const 
struct pf_kruleset *, const struct pf_krule *, struct pfioc_rule *); void pf_kanchor_remove(struct pf_krule *); void pf_remove_if_empty_kruleset(struct pf_kruleset *); struct pf_kruleset *pf_find_kruleset(const char *); struct pf_kruleset *pf_find_or_create_kruleset(const char *); void pf_rs_initialize(void); struct pf_krule *pf_krule_alloc(void); void pf_remove_if_empty_keth_ruleset( struct pf_keth_ruleset *); struct pf_keth_ruleset *pf_find_keth_ruleset(const char *); struct pf_keth_anchor *pf_find_keth_anchor(const char *); int pf_keth_anchor_setup(struct pf_keth_rule *, const struct pf_keth_ruleset *, const char *); int pf_keth_anchor_nvcopyout( const struct pf_keth_ruleset *, const struct pf_keth_rule *, nvlist_t *); struct pf_keth_ruleset *pf_find_or_create_keth_ruleset(const char *); void pf_keth_anchor_remove(struct pf_keth_rule *); void pf_krule_free(struct pf_krule *); #endif /* The fingerprint functions can be linked into userland programs (tcpdump) */ int pf_osfp_add(struct pf_osfp_ioctl *); #ifdef _KERNEL struct pf_osfp_enlist * pf_osfp_fingerprint(struct pf_pdesc *, struct mbuf *, int, const struct tcphdr *); #endif /* _KERNEL */ void pf_osfp_flush(void); int pf_osfp_get(struct pf_osfp_ioctl *); int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t); #ifdef _KERNEL void pf_print_host(struct pf_addr *, u_int16_t, sa_family_t); void pf_step_into_anchor(struct pf_kanchor_stackframe *, int *, struct pf_kruleset **, int, struct pf_krule **, struct pf_krule **, int *); int pf_step_out_of_anchor(struct pf_kanchor_stackframe *, int *, struct pf_kruleset **, int, struct pf_krule **, struct pf_krule **, int *); void pf_step_into_keth_anchor(struct pf_keth_anchor_stackframe *, int *, struct pf_keth_ruleset **, struct pf_keth_rule **, struct pf_keth_rule **, int *); int pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *, int *, struct pf_keth_ruleset **, struct pf_keth_rule **, struct pf_keth_rule **, int *); u_short pf_map_addr(u_int8_t, struct pf_krule *, struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_ksrc_node **); struct pf_krule *pf_get_translation(struct pf_pdesc *, struct mbuf *, int, struct pfi_kkif *, struct pf_ksrc_node **, struct pf_state_key **, struct pf_state_key **, struct pf_addr *, struct pf_addr *, uint16_t, uint16_t, struct pf_kanchor_stackframe *); struct pf_state_key *pf_state_key_setup(struct pf_pdesc *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t); struct pf_state_key *pf_state_key_clone(struct pf_state_key *); void pf_rule_to_actions(struct pf_krule *, struct pf_rule_actions *); int pf_normalize_mss(struct mbuf *m, int off, struct pf_pdesc *pd); #ifdef INET void pf_scrub_ip(struct mbuf **, struct pf_pdesc *); #endif /* INET */ #ifdef INET6 void pf_scrub_ip6(struct mbuf **, struct pf_pdesc *); #endif /* INET6 */ struct pfi_kkif *pf_kkif_create(int); void pf_kkif_free(struct pfi_kkif *); void pf_kkif_zero(struct pfi_kkif *); #endif /* _KERNEL */ #endif /* _NET_PFVAR_H_ */ diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 490a4130f025..a721385f761f 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -1,8117 +1,8252 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2002 - 2008 Henning Brauer * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* dummynet */ #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif /* INET6 */ #include #include #include #include #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x SDT_PROVIDER_DEFINE(pf); SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *", "struct pf_kstate *"); SDT_PROBE_DEFINE4(pf, ip, test6, done, "int", "int", "struct pf_krule *", "struct pf_kstate *"); SDT_PROBE_DEFINE5(pf, ip, state, lookup, "struct pfi_kkif *", "struct pf_state_key_cmp *", "int", "struct pf_pdesc *", "struct pf_kstate *"); SDT_PROBE_DEFINE3(pf, eth, test_rule, entry, "int", "struct ifnet *", "struct mbuf *"); SDT_PROBE_DEFINE2(pf, eth, test_rule, test, "int", "struct pf_keth_rule *"); SDT_PROBE_DEFINE3(pf, eth, test_rule, mismatch, "int", "struct pf_keth_rule *", "char *"); SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *"); SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match, "int", "struct pf_keth_rule *"); /* * Global variables */ /* state tables */ VNET_DEFINE(struct pf_altqqueue, pf_altqs[4]); VNET_DEFINE(struct pf_kpalist, pf_pabuf); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_inactive); VNET_DEFINE(struct pf_kstatus, pf_status); VNET_DEFINE(u_int32_t, 
ticket_altqs_active); VNET_DEFINE(u_int32_t, ticket_altqs_inactive); VNET_DEFINE(int, altqs_inactive_open); VNET_DEFINE(u_int32_t, ticket_pabuf); VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); #define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) VNET_DEFINE(u_char, pf_tcp_secret[16]); #define V_pf_tcp_secret VNET(pf_tcp_secret) VNET_DEFINE(int, pf_tcp_secret_init); #define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) VNET_DEFINE(int, pf_tcp_iss_off); #define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) VNET_DECLARE(int, pf_vnet_active); #define V_pf_vnet_active VNET(pf_vnet_active) VNET_DEFINE_STATIC(uint32_t, pf_purge_idx); #define V_pf_purge_idx VNET(pf_purge_idx) #ifdef PF_WANT_32_TO_64_COUNTER VNET_DEFINE_STATIC(uint32_t, pf_counter_periodic_iter); #define V_pf_counter_periodic_iter VNET(pf_counter_periodic_iter) VNET_DEFINE(struct allrulelist_head, pf_allrulelist); VNET_DEFINE(size_t, pf_allrulecount); VNET_DEFINE(struct pf_krule *, pf_rulemarker); #endif /* * Queue for pf_intr() sends. */ static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); struct pf_send_entry { STAILQ_ENTRY(pf_send_entry) pfse_next; struct mbuf *pfse_m; enum { PFSE_IP, PFSE_IP6, PFSE_ICMP, PFSE_ICMP6, } pfse_type; struct { int type; int code; int mtu; } icmpopts; }; STAILQ_HEAD(pf_send_head, pf_send_entry); VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue); #define V_pf_sendqueue VNET(pf_sendqueue) static struct mtx_padalign pf_sendqueue_mtx; MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF); #define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) #define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) /* * Queue for pf_overload_task() tasks. */ struct pf_overload_entry { SLIST_ENTRY(pf_overload_entry) next; struct pf_addr addr; sa_family_t af; uint8_t dir; struct pf_krule *rule; }; SLIST_HEAD(pf_overload_head, pf_overload_entry); VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue); #define V_pf_overloadqueue VNET(pf_overloadqueue) VNET_DEFINE_STATIC(struct task, pf_overloadtask); #define V_pf_overloadtask VNET(pf_overloadtask) static struct mtx_padalign pf_overloadqueue_mtx; MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx, "pf overload/flush queue", MTX_DEF); #define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx) #define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx) VNET_DEFINE(struct pf_krulequeue, pf_unlinked_rules); struct mtx_padalign pf_unlnkdrules_mtx; MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules", MTX_DEF); struct sx pf_config_lock; SX_SYSINIT(pf_config_lock, &pf_config_lock, "pf config"); struct mtx_padalign pf_table_stats_lock; MTX_SYSINIT(pf_table_stats_lock, &pf_table_stats_lock, "pf table stats", MTX_DEF); VNET_DEFINE_STATIC(uma_zone_t, pf_sources_z); #define V_pf_sources_z VNET(pf_sources_z) uma_zone_t pf_mtag_z; VNET_DEFINE(uma_zone_t, pf_state_z); VNET_DEFINE(uma_zone_t, pf_state_key_z); VNET_DEFINE(struct unrhdr64, pf_stateid); static void pf_src_tree_remove_state(struct pf_kstate *); static void pf_init_threshold(struct pf_threshold *, u_int32_t, u_int32_t); static void pf_add_threshold(struct pf_threshold *); static int pf_check_threshold(struct pf_threshold *); static void pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *, u_int16_t *, u_int16_t *, struct pf_addr *, u_int16_t, u_int8_t, sa_family_t); static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *); static void pf_change_icmp(struct pf_addr *, u_int16_t *, struct pf_addr *, struct pf_addr 
*, u_int16_t, u_int16_t *, u_int16_t *, u_int16_t *, u_int16_t *, u_int8_t, sa_family_t); static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_krule *, int); static void pf_detach_state(struct pf_kstate *); static int pf_state_key_attach(struct pf_state_key *, struct pf_state_key *, struct pf_kstate *); static void pf_state_key_detach(struct pf_kstate *, int); static int pf_state_key_ctor(void *, int, void *, int); static u_int32_t pf_tcp_iss(struct pf_pdesc *); static int pf_dummynet(struct pf_pdesc *, struct pf_kstate *, struct pf_krule *, struct mbuf **); static int pf_dummynet_route(struct pf_pdesc *, struct pf_kstate *, struct pf_krule *, struct ifnet *, struct sockaddr *, struct mbuf **); static int pf_test_eth_rule(int, struct pfi_kkif *, struct mbuf **); static int pf_test_rule(struct pf_krule **, struct pf_kstate **, struct pfi_kkif *, struct mbuf *, int, struct pf_pdesc *, struct pf_krule **, struct pf_kruleset **, struct inpcb *); static int pf_create_state(struct pf_krule *, struct pf_krule *, struct pf_krule *, struct pf_pdesc *, struct pf_ksrc_node *, struct pf_state_key *, struct pf_state_key *, struct mbuf *, int, u_int16_t, u_int16_t, int *, struct pfi_kkif *, struct pf_kstate **, int, u_int16_t, u_int16_t, int, struct pf_krule_slist *); static int pf_test_fragment(struct pf_krule **, struct pfi_kkif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_krule **, struct pf_kruleset **); static int pf_tcp_track_full(struct pf_kstate **, struct pfi_kkif *, struct mbuf *, int, struct pf_pdesc *, u_short *, int *); static int pf_tcp_track_sloppy(struct pf_kstate **, struct pf_pdesc *, u_short *); static int pf_test_state_tcp(struct pf_kstate **, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_udp(struct pf_kstate **, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *); static int pf_test_state_icmp(struct pf_kstate **, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); +static int pf_test_state_sctp(struct pf_kstate **, + struct pfi_kkif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); static int pf_test_state_other(struct pf_kstate **, struct pfi_kkif *, struct mbuf *, struct pf_pdesc *); static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, int, u_int16_t); static int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); static void pf_print_state_parts(struct pf_kstate *, struct pf_state_key *, struct pf_state_key *); static void pf_patch_8(struct mbuf *, u_int16_t *, u_int8_t *, u_int8_t, bool, u_int8_t); static struct pf_kstate *pf_find_state(struct pfi_kkif *, struct pf_state_key_cmp *, u_int); static int pf_src_connlimit(struct pf_kstate **); static void pf_overload_task(void *v, int pending); static u_short pf_insert_src_node(struct pf_ksrc_node **, struct pf_krule *, struct pf_addr *, sa_family_t); static u_int pf_purge_expired_states(u_int, int); static void pf_purge_unlinked_rules(void); static int pf_mtag_uminit(void *, int, int); static void pf_mtag_free(struct m_tag *); static void pf_packet_rework_nat(struct mbuf *, struct pf_pdesc *, int, struct pf_state_key *); #ifdef INET static void pf_route(struct mbuf **, struct pf_krule *, struct ifnet *, struct pf_kstate *, struct pf_pdesc *, struct inpcb *); #endif /* INET */ #ifdef INET6 static void pf_change_a6(struct pf_addr *, u_int16_t *, struct pf_addr *, u_int8_t); static void pf_route6(struct mbuf **, struct pf_krule *, struct ifnet *, struct 
pf_kstate *, struct pf_pdesc *, struct inpcb *); #endif /* INET6 */ static __inline void pf_set_protostate(struct pf_kstate *, int, u_int8_t); int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); extern int pf_end_threads; extern struct proc *pf_purge_proc; VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define PACKET_UNDO_NAT(_m, _pd, _off, _s) \ do { \ struct pf_state_key *nk; \ if ((pd->dir) == PF_OUT) \ nk = (_s)->key[PF_SK_STACK]; \ else \ nk = (_s)->key[PF_SK_WIRE]; \ pf_packet_rework_nat(_m, _pd, _off, nk); \ } while (0) #define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ (pd)->pf_mtag->flags & PF_MTAG_FLAG_PACKET_LOOPED) #define STATE_LOOKUP(i, k, s, pd) \ do { \ (s) = pf_find_state((i), (k), (pd->dir)); \ SDT_PROBE5(pf, ip, state, lookup, i, k, (pd->dir), pd, (s)); \ if ((s) == NULL) \ return (PF_DROP); \ if (PACKET_LOOPED(pd)) \ return (PF_PASS); \ } while (0) #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all #define STATE_INC_COUNTERS(s) \ do { \ struct pf_krule_item *mrm; \ counter_u64_add(s->rule.ptr->states_cur, 1); \ counter_u64_add(s->rule.ptr->states_tot, 1); \ if (s->anchor.ptr != NULL) { \ counter_u64_add(s->anchor.ptr->states_cur, 1); \ counter_u64_add(s->anchor.ptr->states_tot, 1); \ } \ if (s->nat_rule.ptr != NULL) { \ counter_u64_add(s->nat_rule.ptr->states_cur, 1);\ counter_u64_add(s->nat_rule.ptr->states_tot, 1);\ } \ SLIST_FOREACH(mrm, &s->match_rules, entry) { \ counter_u64_add(mrm->r->states_cur, 1); \ counter_u64_add(mrm->r->states_tot, 1); \ } \ } while (0) #define STATE_DEC_COUNTERS(s) \ do { \ struct pf_krule_item *mrm; \ if (s->nat_rule.ptr != NULL) \ counter_u64_add(s->nat_rule.ptr->states_cur, -1);\ if (s->anchor.ptr != NULL) \ counter_u64_add(s->anchor.ptr->states_cur, -1); \ counter_u64_add(s->rule.ptr->states_cur, -1); \ SLIST_FOREACH(mrm, &s->match_rules, entry) \ counter_u64_add(mrm->r->states_cur, -1); \ } while (0) MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); MALLOC_DEFINE(M_PF_RULE_ITEM, "pf_krule_item", "pf(4) rule items"); VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(struct pf_srchash *, pf_srchash); SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "pf(4)"); u_long pf_hashmask; u_long pf_srchashmask; static u_long pf_hashsize; static u_long pf_srchashsize; u_long pf_ioctl_maxcount = 65535; SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, &pf_hashsize, 0, "Size of pf(4) states hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN, &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call"); VNET_DEFINE(void *, pf_swi_cookie); VNET_DEFINE(struct intr_event *, pf_swi_ie); VNET_DEFINE(uint32_t, pf_hashseed); #define V_pf_hashseed VNET(pf_hashseed) static void pf_sctp_checksum(struct mbuf *m, int off) { uint32_t sum = 0; /* Zero out the checksum, to enable recalculation. 
*/ m_copyback(m, off + offsetof(struct sctphdr, checksum), sizeof(sum), (caddr_t)&sum); sum = sctp_calculate_cksum(m, off); m_copyback(m, off + offsetof(struct sctphdr, checksum), sizeof(sum), (caddr_t)&sum); } int pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->addr32[3] > b->addr32[3]) return (1); if (a->addr32[3] < b->addr32[3]) return (-1); if (a->addr32[2] > b->addr32[2]) return (1); if (a->addr32[2] < b->addr32[2]) return (-1); if (a->addr32[1] > b->addr32[1]) return (1); if (a->addr32[1] < b->addr32[1]) return (-1); if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET6 */ default: panic("%s: unknown address family %u", __func__, af); } return (0); } static void pf_packet_rework_nat(struct mbuf *m, struct pf_pdesc *pd, int off, struct pf_state_key *nk) { switch (pd->proto) { case IPPROTO_TCP: { struct tcphdr *th = &pd->hdr.tcp; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, pd->af); m_copyback(m, off, sizeof(*th), (caddr_t)th); break; } case IPPROTO_UDP: { struct udphdr *uh = &pd->hdr.udp; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); break; } case IPPROTO_ICMP: { struct icmp *ih = &pd->hdr.icmp; if (nk->port[pd->sidx] != ih->icmp_id) { pd->hdr.icmp.icmp_cksum = pf_cksum_fixup( ih->icmp_cksum, ih->icmp_id, nk->port[pd->sidx], 0); ih->icmp_id = nk->port[pd->sidx]; pd->sport = &ih->icmp_id; m_copyback(m, off, ICMP_MINLEN, (caddr_t)ih); } /* FALLTHROUGH */ } default: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) { switch (pd->af) { case AF_INET: pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); break; case AF_INET6: PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); break; } } if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) { switch (pd->af) { case AF_INET: pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; case AF_INET6: PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); break; } } break; } } static __inline uint32_t pf_hashkey(struct pf_state_key *sk) { uint32_t h; h = murmur3_32_hash32((uint32_t *)sk, sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), V_pf_hashseed); return (h & pf_hashmask); } static __inline uint32_t pf_hashsrc(struct pf_addr *addr, sa_family_t af) { uint32_t h; switch (af) { case AF_INET: h = murmur3_32_hash32((uint32_t *)&addr->v4, sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed); break; case AF_INET6: h = murmur3_32_hash32((uint32_t *)&addr->v6, sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed); break; default: panic("%s: unknown address family %u", __func__, af); } return (h & pf_srchashmask); } #ifdef ALTQ static int pf_state_hash(struct pf_kstate *s) { u_int32_t hv = (intptr_t)s / sizeof(*s); hv ^= crc32(&s->src, 
sizeof(s->src)); hv ^= crc32(&s->dst, sizeof(s->dst)); if (hv == 0) hv = 1; return (hv); } #endif static __inline void pf_set_protostate(struct pf_kstate *s, int which, u_int8_t newstate) { if (which == PF_PEER_DST || which == PF_PEER_BOTH) s->dst.state = newstate; if (which == PF_PEER_DST) return; if (s->src.state == newstate) return; if (s->creatorid == V_pf_status.hostid && s->key[PF_SK_STACK] != NULL && s->key[PF_SK_STACK]->proto == IPPROTO_TCP && !(TCPS_HAVEESTABLISHED(s->src.state) || s->src.state == TCPS_CLOSED) && (TCPS_HAVEESTABLISHED(newstate) || newstate == TCPS_CLOSED)) atomic_add_32(&V_pf_status.states_halfopen, -1); s->src.state = newstate; } #ifdef INET6 void pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: dst->addr32[0] = src->addr32[0]; break; #endif /* INET */ case AF_INET6: dst->addr32[0] = src->addr32[0]; dst->addr32[1] = src->addr32[1]; dst->addr32[2] = src->addr32[2]; dst->addr32[3] = src->addr32[3]; break; } } #endif /* INET6 */ static void pf_init_threshold(struct pf_threshold *threshold, u_int32_t limit, u_int32_t seconds) { threshold->limit = limit * PF_THRESHOLD_MULT; threshold->seconds = seconds; threshold->count = 0; threshold->last = time_uptime; } static void pf_add_threshold(struct pf_threshold *threshold) { u_int32_t t = time_uptime, diff = t - threshold->last; if (diff >= threshold->seconds) threshold->count = 0; else threshold->count -= threshold->count * diff / threshold->seconds; threshold->count += PF_THRESHOLD_MULT; threshold->last = t; } static int pf_check_threshold(struct pf_threshold *threshold) { return (threshold->count > threshold->limit); } static int pf_src_connlimit(struct pf_kstate **state) { struct pf_overload_entry *pfoe; int bad = 0; PF_STATE_LOCK_ASSERT(*state); /* * XXXKS: The src node is accessed unlocked! * PF_SRC_NODE_LOCK_ASSERT((*state)->src_node); */ (*state)->src_node->conn++; (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); if ((*state)->rule.ptr->max_src_conn && (*state)->rule.ptr->max_src_conn < (*state)->src_node->conn) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1); bad++; } if ((*state)->rule.ptr->max_src_conn_rate.limit && pf_check_threshold(&(*state)->src_node->conn_rate)) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1); bad++; } if (!bad) return (0); /* Kill this state. */ (*state)->timeout = PFTM_PURGE; pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED); if ((*state)->rule.ptr->overload_tbl == NULL) return (1); /* Schedule overloading and flushing task. 
*/
	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
	if (pfoe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
	pfoe->rule = (*state)->rule.ptr;
	pfoe->dir = (*state)->direction;
	PF_OVERLOADQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
	PF_OVERLOADQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);

	return (1);
}

static void
pf_overload_task(void *v, int pending)
{
	struct pf_overload_head queue;
	struct pfr_addr p;
	struct pf_overload_entry *pfoe, *pfoe1;
	uint32_t killed = 0;

	CURVNET_SET((struct vnet *)v);

	PF_OVERLOADQ_LOCK();
	queue = V_pf_overloadqueue;
	SLIST_INIT(&V_pf_overloadqueue);
	PF_OVERLOADQ_UNLOCK();

	bzero(&p, sizeof(p));
	SLIST_FOREACH(pfoe, &queue, next) {
		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("%s: blocking address ", __func__);
			pf_print_host(&pfoe->addr, 0, pfoe->af);
			printf("\n");
		}

		p.pfra_af = pfoe->af;
		switch (pfoe->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = pfoe->addr.v4;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = pfoe->addr.v6;
			break;
#endif
		}

		PF_RULES_WLOCK();
		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
		PF_RULES_WUNLOCK();
	}

	/*
	 * Remove those entries that don't need flushing.
	 */
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		if (pfoe->rule->flush == 0) {
			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
			free(pfoe, M_PFTEMP);
		} else
			counter_u64_add(
			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);

	/* If nothing to flush, return. */
	if (SLIST_EMPTY(&queue)) {
		CURVNET_RESTORE();
		return;
	}

	for (int i = 0; i <= pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_kstate *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			sk = s->key[PF_SK_WIRE];
			SLIST_FOREACH(pfoe, &queue, next)
				if (sk->af == pfoe->af &&
				    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
				    pfoe->rule == s->rule.ptr) &&
				    ((pfoe->dir == PF_OUT &&
				    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
				    (pfoe->dir == PF_IN &&
				    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
					s->timeout = PFTM_PURGE;
					pf_set_protostate(s, PF_PEER_BOTH,
					    TCPS_CLOSED);
					killed++;
				}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		free(pfoe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);

	CURVNET_RESTORE();
}

/*
 * Can return locked on failure, so that we can consistently
 * allocate and insert a new one.
*/ struct pf_ksrc_node * pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af, struct pf_srchash **sh, bool returnlocked) { struct pf_ksrc_node *n; counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); *sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_LOCK(*sh); LIST_FOREACH(n, &(*sh)->nodes, entry) if (n->rule.ptr == rule && n->af == af && ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) break; if (n != NULL) { n->states++; PF_HASHROW_UNLOCK(*sh); } else if (returnlocked == false) PF_HASHROW_UNLOCK(*sh); return (n); } static void pf_free_src_node(struct pf_ksrc_node *sn) { for (int i = 0; i < 2; i++) { counter_u64_free(sn->bytes[i]); counter_u64_free(sn->packets[i]); } uma_zfree(V_pf_sources_z, sn); } static u_short pf_insert_src_node(struct pf_ksrc_node **sn, struct pf_krule *rule, struct pf_addr *src, sa_family_t af) { u_short reason = 0; struct pf_srchash *sh = NULL; KASSERT((rule->rule_flag & PFRULE_SRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR), ("%s for non-tracking rule %p", __func__, rule)); if (*sn == NULL) *sn = pf_find_src_node(src, rule, af, &sh, true); if (*sn == NULL) { PF_HASHROW_ASSERT(sh); if (rule->max_src_nodes && counter_u64_fetch(rule->src_nodes) >= rule->max_src_nodes) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1); PF_HASHROW_UNLOCK(sh); reason = PFRES_SRCLIMIT; goto done; } (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); if ((*sn) == NULL) { PF_HASHROW_UNLOCK(sh); reason = PFRES_MEMORY; goto done; } for (int i = 0; i < 2; i++) { (*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT); (*sn)->packets[i] = counter_u64_alloc(M_NOWAIT); if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) { pf_free_src_node(*sn); PF_HASHROW_UNLOCK(sh); reason = PFRES_MEMORY; goto done; } } pf_init_threshold(&(*sn)->conn_rate, rule->max_src_conn_rate.limit, rule->max_src_conn_rate.seconds); MPASS((*sn)->lock == NULL); (*sn)->lock = &sh->lock; (*sn)->af = af; (*sn)->rule.ptr = rule; PF_ACPY(&(*sn)->addr, src, af); LIST_INSERT_HEAD(&sh->nodes, *sn, entry); (*sn)->creation = time_uptime; (*sn)->ruletype = rule->action; (*sn)->states = 1; if ((*sn)->rule.ptr != NULL) counter_u64_add((*sn)->rule.ptr->src_nodes, 1); PF_HASHROW_UNLOCK(sh); counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); } else { if (rule->max_src_states && (*sn)->states >= rule->max_src_states) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], 1); reason = PFRES_SRCLIMIT; goto done; } } done: return (reason); } void pf_unlink_src_node(struct pf_ksrc_node *src) { PF_SRC_NODE_LOCK_ASSERT(src); LIST_REMOVE(src, entry); if (src->rule.ptr) counter_u64_add(src->rule.ptr->src_nodes, -1); } u_int pf_free_src_nodes(struct pf_ksrc_node_list *head) { struct pf_ksrc_node *sn, *tmp; u_int count = 0; LIST_FOREACH_SAFE(sn, head, entry, tmp) { pf_free_src_node(sn); count++; } counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count); return (count); } void pf_mtag_initialize(void) { pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL, UMA_ALIGN_PTR, 0); } /* Per-vnet data storage structures initialization. 
*/
void
pf_initialize(void)
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

	if (pf_hashsize == 0 || !powerof2(pf_hashsize))
		pf_hashsize = PF_HASHSIZ;
	if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
		pf_srchashsize = PF_SRCHASHSIZ;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_kstate),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
	    M_PFHASH, M_NOWAIT | M_ZERO);
	V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
	    M_PFHASH, M_NOWAIT | M_ZERO);
	if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
		printf("pf: Unable to allocate memory for "
		    "state_hashsize %lu.\n", pf_hashsize);

		free(V_pf_keyhash, M_PFHASH);
		free(V_pf_idhash, M_PFHASH);

		pf_hashsize = PF_HASHSIZ;
		V_pf_keyhash = mallocarray(pf_hashsize,
		    sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
		V_pf_idhash = mallocarray(pf_hashsize,
		    sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
	}

	pf_hashmask = pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_ksrc_node), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");

	V_pf_srchash = mallocarray(pf_srchashsize,
	    sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
	if (V_pf_srchash == NULL) {
		printf("pf: Unable to allocate memory for "
		    "source_hashsize %lu.\n", pf_srchashsize);

		pf_srchashsize = PF_SRCHASHSIZ;
		V_pf_srchash = mallocarray(pf_srchashsize,
		    sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
	}

	pf_srchashmask = pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_altqs[2]);
	TAILQ_INIT(&V_pf_altqs[3]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altq_ifs_active = &V_pf_altqs[1];
	V_pf_altqs_inactive = &V_pf_altqs[2];
	V_pf_altq_ifs_inactive = &V_pf_altqs[3];

	/* Send & overload+flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_overloadqueue);
	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);

	/* Unlinked rules that may still be referenced. 
*/ TAILQ_INIT(&V_pf_unlinked_rules); } void pf_mtag_cleanup(void) { uma_zdestroy(pf_mtag_z); } void pf_cleanup(void) { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; struct pf_send_entry *pfse, *next; u_int i; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; i++, kh++, ih++) { KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", __func__)); KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", __func__)); mtx_destroy(&kh->lock); mtx_destroy(&ih->lock); } free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { KASSERT(LIST_EMPTY(&sh->nodes), ("%s: source node hash not empty", __func__)); mtx_destroy(&sh->lock); } free(V_pf_srchash, M_PFHASH); STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { m_freem(pfse->pfse_m); free(pfse, M_PFTEMP); } uma_zdestroy(V_pf_sources_z); uma_zdestroy(V_pf_state_z); uma_zdestroy(V_pf_state_key_z); } static int pf_mtag_uminit(void *mem, int size, int how) { struct m_tag *t; t = (struct m_tag *)mem; t->m_tag_cookie = MTAG_ABI_COMPAT; t->m_tag_id = PACKET_TAG_PF; t->m_tag_len = sizeof(struct pf_mtag); t->m_tag_free = pf_mtag_free; return (0); } static void pf_mtag_free(struct m_tag *t) { uma_zfree(pf_mtag_z, t); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) return ((struct pf_mtag *)(mtag + 1)); mtag = uma_zalloc(pf_mtag_z, M_NOWAIT); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof(struct pf_mtag)); m_tag_prepend(m, mtag); return ((struct pf_mtag *)(mtag + 1)); } static int pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, struct pf_kstate *s) { struct pf_keyhash *khs, *khw, *kh; struct pf_state_key *sk, *cur; struct pf_kstate *si, *olds = NULL; int idx; KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); /* * We need to lock hash slots of both keys. To avoid deadlock * we always lock the slot with lower address first. Unlock order * isn't important. * * We also need to lock ID hash slot before dropping key * locks. On success we return with ID hash slot locked. */ if (skw == sks) { khs = khw = &V_pf_keyhash[pf_hashkey(skw)]; PF_HASHROW_LOCK(khs); } else { khs = &V_pf_keyhash[pf_hashkey(sks)]; khw = &V_pf_keyhash[pf_hashkey(skw)]; if (khs == khw) { PF_HASHROW_LOCK(khs); } else if (khs < khw) { PF_HASHROW_LOCK(khs); PF_HASHROW_LOCK(khw); } else { PF_HASHROW_LOCK(khw); PF_HASHROW_LOCK(khs); } } #define KEYS_UNLOCK() do { \ if (khs != khw) { \ PF_HASHROW_UNLOCK(khs); \ PF_HASHROW_UNLOCK(khw); \ } else \ PF_HASHROW_UNLOCK(khs); \ } while (0) /* * First run: start with wire key. */ sk = skw; kh = khw; idx = PF_SK_WIRE; MPASS(s->lock == NULL); s->lock = &V_pf_idhash[PF_IDHASH(s)].lock; keyattach: LIST_FOREACH(cur, &kh->keys, entry) if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) break; if (cur != NULL) { /* Key exists. Check for same kif, if none, add to key. */ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; PF_HASHROW_LOCK(ih); if (si->kif == s->kif && si->direction == s->direction) { if (sk->proto == IPPROTO_TCP && si->src.state >= TCPS_FIN_WAIT_2 && si->dst.state >= TCPS_FIN_WAIT_2) { /* * New state matches an old >FIN_WAIT_2 * state. We can't drop key hash locks, * thus we can't unlink it properly. 
					 *
					 * As a workaround we drop it into
					 * TCPS_CLOSED state, schedule purge
					 * ASAP and push it into the very end
					 * of the slot TAILQ, so that it won't
					 * conflict with our new state.
					 */
					pf_set_protostate(si, PF_PEER_BOTH,
					    TCPS_CLOSED);
					si->timeout = PFTM_PURGE;
					olds = si;
				} else {
					if (V_pf_status.debug >=
					    PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					KEYS_UNLOCK();
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (EEXIST); /* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	if (olds) {
		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
		    key_list[idx]);
		olds = NULL;
	}

	/*
	 * Attach done. Decide whether (and how) a second key
	 * should be attached.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		kh = khs;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	}

	PF_STATE_LOCK(s);
	KEYS_UNLOCK();

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
#undef	KEYS_UNLOCK
}

static void
pf_detach_state(struct pf_kstate *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);

		/*
		 * If both point to the same key, then we are done. 
*/ if (sks == s->key[PF_SK_WIRE]) { pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); return; } PF_HASHROW_UNLOCK(kh); } if (s->key[PF_SK_WIRE] != NULL) { kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_WIRE] != NULL) pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); } } static void pf_state_key_detach(struct pf_kstate *s, int idx) { struct pf_state_key *sk = s->key[idx]; #ifdef INVARIANTS struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; PF_HASHROW_ASSERT(kh); #endif TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); s->key[idx] = NULL; if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { LIST_REMOVE(sk, entry); uma_zfree(V_pf_state_key_z, sk); } } static int pf_state_key_ctor(void *mem, int size, void *arg, int flags) { struct pf_state_key *sk = mem; bzero(sk, sizeof(struct pf_state_key_cmp)); TAILQ_INIT(&sk->states[PF_SK_WIRE]); TAILQ_INIT(&sk->states[PF_SK_STACK]); return (0); } struct pf_state_key * pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); sk->port[pd->sidx] = sport; sk->port[pd->didx] = dport; sk->proto = pd->proto; sk->af = pd->af; return (sk); } struct pf_state_key * pf_state_key_clone(struct pf_state_key *orig) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); return (sk); } int pf_state_insert(struct pfi_kkif *kif, struct pfi_kkif *orig_kif, struct pf_state_key *skw, struct pf_state_key *sks, struct pf_kstate *s) { struct pf_idhash *ih; struct pf_kstate *cur; int error; KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), ("%s: sks not pristine", __func__)); KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), ("%s: skw not pristine", __func__)); KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); s->kif = kif; s->orig_kif = orig_kif; if (s->id == 0 && s->creatorid == 0) { s->id = alloc_unr64(&V_pf_stateid); s->id = htobe64(s->id); s->creatorid = V_pf_status.hostid; } /* Returns with ID locked on success. */ if ((error = pf_state_key_attach(skw, sks, s)) != 0) return (error); ih = &V_pf_idhash[PF_IDHASH(s)]; PF_HASHROW_ASSERT(ih); LIST_FOREACH(cur, &ih->states, entry) if (cur->id == s->id && cur->creatorid == s->creatorid) break; if (cur != NULL) { PF_HASHROW_UNLOCK(ih); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state ID collision: " "id: %016llx creatorid: %08x\n", (unsigned long long)be64toh(s->id), ntohl(s->creatorid)); } pf_detach_state(s); return (EEXIST); } LIST_INSERT_HEAD(&ih->states, s, entry); /* One for keys, one for ID hash. */ refcount_init(&s->refs, 2); pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_INSERT], 1); if (V_pfsync_insert_state_ptr != NULL) V_pfsync_insert_state_ptr(s); /* Returns locked. */ return (0); } /* * Find state by ID: returns with locked row on success. 
*/ struct pf_kstate * pf_find_state_byid(uint64_t id, uint32_t creatorid) { struct pf_idhash *ih; struct pf_kstate *s; pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) if (s->id == id && s->creatorid == creatorid) break; if (s == NULL) PF_HASHROW_UNLOCK(ih); return (s); } /* * Find state by key. * Returns with ID hash slot locked on success. */ static struct pf_kstate * pf_find_state(struct pfi_kkif *kif, struct pf_state_key_cmp *key, u_int dir) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_kstate *s; int idx; pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); /* List is sorted, if-bound states before floating ones. */ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) if (s->kif == V_pfi_all || s->kif == kif) { PF_STATE_LOCK(s); PF_HASHROW_UNLOCK(kh); if (__predict_false(s->timeout >= PFTM_MAX)) { /* * State is either being processed by * pf_unlink_state() in another thread, or * is scheduled for immediate expiry. */ PF_STATE_UNLOCK(s); return (NULL); } return (s); } PF_HASHROW_UNLOCK(kh); return (NULL); } /* * Returns with ID hash slot locked on success. */ struct pf_kstate * pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_kstate *s, *ret = NULL; int idx, inout = 0; pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } switch (dir) { case PF_IN: idx = PF_SK_WIRE; break; case PF_OUT: idx = PF_SK_STACK; break; case PF_INOUT: idx = PF_SK_WIRE; inout = 1; break; default: panic("%s: dir %u", __func__, dir); } second_run: TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { if (more == NULL) { PF_STATE_LOCK(s); PF_HASHROW_UNLOCK(kh); return (s); } if (ret) (*more)++; else { ret = s; PF_STATE_LOCK(s); } } if (inout == 1) { inout = 0; idx = PF_SK_STACK; goto second_run; } PF_HASHROW_UNLOCK(kh); return (ret); } /* * FIXME * This routine is inefficient -- locks the state only to unlock immediately on * return. * It is racy -- after the state is unlocked nothing stops other threads from * removing it. */ bool pf_find_state_all_exists(struct pf_state_key_cmp *key, u_int dir) { struct pf_kstate *s; s = pf_find_state_all(key, dir, NULL); if (s != NULL) { PF_STATE_UNLOCK(s); return (true); } return (false); } /* END state table stuff */ static void pf_send(struct pf_send_entry *pfse) { PF_SENDQ_LOCK(); STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); PF_SENDQ_UNLOCK(); swi_sched(V_pf_swi_cookie, 0); } static bool pf_isforlocal(struct mbuf *m, int af) { switch (af) { #ifdef INET case AF_INET: { struct ip *ip = mtod(m, struct ip *); return (in_localip(ip->ip_dst)); } #endif #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; struct in6_ifaddr *ia; ip6 = mtod(m, struct ip6_hdr *); ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia == NULL) return (false); return (!
(ia->ia6_flags & IN6_IFF_NOTREADY)); } #endif default: panic("Unsupported af %d", af); } return (false); } void pf_intr(void *v) { struct epoch_tracker et; struct pf_send_head queue; struct pf_send_entry *pfse, *next; CURVNET_SET((struct vnet *)v); PF_SENDQ_LOCK(); queue = V_pf_sendqueue; STAILQ_INIT(&V_pf_sendqueue); PF_SENDQ_UNLOCK(); NET_EPOCH_ENTER(et); STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { switch (pfse->pfse_type) { #ifdef INET case PFSE_IP: { if (pf_isforlocal(pfse->pfse_m, AF_INET)) { pfse->pfse_m->m_flags |= M_SKIP_FIREWALL; pfse->pfse_m->m_pkthdr.csum_flags |= CSUM_IP_VALID | CSUM_IP_CHECKED; ip_input(pfse->pfse_m); } else { ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); } break; } case PFSE_ICMP: icmp_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, 0, pfse->icmpopts.mtu); break; #endif /* INET */ #ifdef INET6 case PFSE_IP6: if (pf_isforlocal(pfse->pfse_m, AF_INET6)) { pfse->pfse_m->m_flags |= M_SKIP_FIREWALL; ip6_input(pfse->pfse_m); } else { ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, NULL); } break; case PFSE_ICMP6: icmp6_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, pfse->icmpopts.mtu); break; #endif /* INET6 */ default: panic("%s: unknown type", __func__); } free(pfse, M_PFTEMP); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } #define pf_purge_thread_period (hz / 10) #ifdef PF_WANT_32_TO_64_COUNTER static void pf_status_counter_u64_periodic(void) { PF_RULES_RASSERT(); if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 60)) != 0) { return; } for (int i = 0; i < FCNT_MAX; i++) { pf_counter_u64_periodic(&V_pf_status.fcounters[i]); } } static void pf_kif_counter_u64_periodic(void) { struct pfi_kkif *kif; size_t r, run; PF_RULES_RASSERT(); if (__predict_false(V_pf_allkifcount == 0)) { return; } if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) { return; } run = V_pf_allkifcount / 10; if (run < 5) run = 5; for (r = 0; r < run; r++) { kif = LIST_NEXT(V_pf_kifmarker, pfik_allkiflist); if (kif == NULL) { LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist); LIST_INSERT_HEAD(&V_pf_allkiflist, V_pf_kifmarker, pfik_allkiflist); break; } LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist); LIST_INSERT_AFTER(kif, V_pf_kifmarker, pfik_allkiflist); for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { for (int k = 0; k < 2; k++) { pf_counter_u64_periodic(&kif->pfik_packets[i][j][k]); pf_counter_u64_periodic(&kif->pfik_bytes[i][j][k]); } } } } } static void pf_rule_counter_u64_periodic(void) { struct pf_krule *rule; size_t r, run; PF_RULES_RASSERT(); if (__predict_false(V_pf_allrulecount == 0)) { return; } if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) { return; } run = V_pf_allrulecount / 10; if (run < 5) run = 5; for (r = 0; r < run; r++) { rule = LIST_NEXT(V_pf_rulemarker, allrulelist); if (rule == NULL) { LIST_REMOVE(V_pf_rulemarker, allrulelist); LIST_INSERT_HEAD(&V_pf_allrulelist, V_pf_rulemarker, allrulelist); break; } LIST_REMOVE(V_pf_rulemarker, allrulelist); LIST_INSERT_AFTER(rule, V_pf_rulemarker, allrulelist); pf_counter_u64_periodic(&rule->evaluations); for (int i = 0; i < 2; i++) { pf_counter_u64_periodic(&rule->packets[i]); pf_counter_u64_periodic(&rule->bytes[i]); } } } static void pf_counter_u64_periodic_main(void) { PF_RULES_RLOCK_TRACKER; V_pf_counter_periodic_iter++; PF_RULES_RLOCK(); pf_counter_u64_critical_enter(); pf_status_counter_u64_periodic(); pf_kif_counter_u64_periodic(); pf_rule_counter_u64_periodic(); pf_counter_u64_critical_exit(); 
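/*
 * The two walkers above drain the 32-bit counters in slices: each
 * eligible pass covers roughly a tenth of all kifs/rules (never fewer
 * than five), using V_pf_kifmarker/V_pf_rulemarker as cursor nodes that
 * are re-linked after the last visited entry, so the next pass resumes
 * where this one stopped.
 */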
PF_RULES_RUNLOCK(); } #else #define pf_counter_u64_periodic_main() do { } while (0) #endif void pf_purge_thread(void *unused __unused) { VNET_ITERATOR_DECL(vnet_iter); sx_xlock(&pf_end_lock); while (pf_end_threads == 0) { sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", pf_purge_thread_period); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* Wait until V_pf_default_rule is initialized. */ if (V_pf_vnet_active == 0) { CURVNET_RESTORE(); continue; } pf_counter_u64_periodic_main(); /* * Process 1/interval fraction of the state * table every run. */ V_pf_purge_idx = pf_purge_expired_states(V_pf_purge_idx, pf_hashmask / (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); /* * Purge other expired types every * PFTM_INTERVAL seconds. */ if (V_pf_purge_idx == 0) { /* * Order is important: * - states and src nodes reference rules * - states and rules reference kifs */ pf_purge_expired_fragments(); pf_purge_expired_src_nodes(); pf_purge_unlinked_rules(); pfi_kkif_purge(); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } pf_end_threads++; sx_xunlock(&pf_end_lock); kproc_exit(0); } void pf_unload_vnet_purge(void) { /* * To clean up all kifs and rules we need * two runs: the first one clears reference flags, * after which pf_purge_expired_states() no longer * raises them, and the second run frees. */ pf_purge_unlinked_rules(); pfi_kkif_purge(); /* * Now purge everything. */ pf_purge_expired_states(0, pf_hashmask); pf_purge_fragments(UINT_MAX); pf_purge_expired_src_nodes(); /* * Now all kifs & rules should be unreferenced, * thus can be successfully freed. */ pf_purge_unlinked_rules(); pfi_kkif_purge(); } u_int32_t pf_state_expires(const struct pf_kstate *state) { u_int32_t timeout; u_int32_t start; u_int32_t end; u_int32_t states; /* handle all PFTM_* > PFTM_MAX here */ if (state->timeout == PFTM_PURGE) return (time_uptime); KASSERT(state->timeout != PFTM_UNLINKED, ("pf_state_expires: timeout == PFTM_UNLINKED")); KASSERT((state->timeout < PFTM_MAX), ("pf_state_expires: timeout > PFTM_MAX")); timeout = state->rule.ptr->timeout[state->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; if (start && state->rule.ptr != &V_pf_default_rule) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = counter_u64_fetch(state->rule.ptr->states_cur); } else { start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; states = V_pf_status.states; } if (end && states > start && start < end) { if (states < end) { timeout = (u_int64_t)timeout * (end - states) / (end - start); return (state->expire + timeout); } else return (time_uptime); } return (state->expire + timeout); } void pf_purge_expired_src_nodes(void) { struct pf_ksrc_node_list freelist; struct pf_srchash *sh; struct pf_ksrc_node *cur, *next; int i; LIST_INIT(&freelist); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) if (cur->states == 0 && cur->expire <= time_uptime) { pf_unlink_src_node(cur); LIST_INSERT_HEAD(&freelist, cur, entry); } else if (cur->rule.ptr != NULL) cur->rule.ptr->rule_ref |= PFRULE_REFS; PF_HASHROW_UNLOCK(sh); } pf_free_src_nodes(&freelist); V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z); } static void pf_src_tree_remove_state(struct pf_kstate *s) { struct pf_ksrc_node *sn; uint32_t timeout; timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
s->rule.ptr->timeout[PFTM_SRC_NODE] : V_pf_default_rule.timeout[PFTM_SRC_NODE]; if (s->src_node != NULL) { sn = s->src_node; PF_SRC_NODE_LOCK(sn); if (s->src.tcp_est) --sn->conn; if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_SRC_NODE_UNLOCK(sn); } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { sn = s->nat_src_node; PF_SRC_NODE_LOCK(sn); if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_SRC_NODE_UNLOCK(sn); } s->src_node = s->nat_src_node = NULL; } /* * Unlink and potentially free a state. Function may be * called with ID hash row locked, but always returns * unlocked, since it needs to go through key hash locking. */ int pf_unlink_state(struct pf_kstate *s) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; PF_HASHROW_ASSERT(ih); if (s->timeout == PFTM_UNLINKED) { /* * State is being processed * by pf_unlink_state() in * another thread. */ PF_HASHROW_UNLOCK(ih); return (0); /* XXXGL: undefined actually */ } if (s->src.state == PF_TCPS_PROXY_DST) { /* XXX wire key the right one? */ pf_send_tcp(s->rule.ptr, s->key[PF_SK_WIRE]->af, &s->key[PF_SK_WIRE]->addr[1], &s->key[PF_SK_WIRE]->addr[0], s->key[PF_SK_WIRE]->port[1], s->key[PF_SK_WIRE]->port[0], s->src.seqhi, s->src.seqlo + 1, TH_RST|TH_ACK, 0, 0, 0, true, s->tag, 0, s->act.rtableid); } LIST_REMOVE(s, entry); pf_src_tree_remove_state(s); if (V_pfsync_delete_state_ptr != NULL) V_pfsync_delete_state_ptr(s); STATE_DEC_COUNTERS(s); s->timeout = PFTM_UNLINKED; /* Ensure we remove it from the list of halfopen states, if needed. */ if (s->key[PF_SK_STACK] != NULL && s->key[PF_SK_STACK]->proto == IPPROTO_TCP) pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED); PF_HASHROW_UNLOCK(ih); pf_detach_state(s); /* pf_state_insert() initialises refs to 2 */ return (pf_release_staten(s, 2)); } struct pf_kstate * pf_alloc_state(int flags) { return (uma_zalloc(V_pf_state_z, flags | M_ZERO)); } void pf_free_state(struct pf_kstate *cur) { struct pf_krule_item *ri; KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, cur->timeout)); while ((ri = SLIST_FIRST(&cur->match_rules))) { SLIST_REMOVE_HEAD(&cur->match_rules, entry); free(ri, M_PF_RULE_ITEM); } pf_normalize_tcp_cleanup(cur); uma_zfree(V_pf_state_z, cur); pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1); } /* * Called only from pf_purge_thread(), thus serialized. */ static u_int pf_purge_expired_states(u_int i, int maxcheck) { struct pf_idhash *ih; struct pf_kstate *s; struct pf_krule_item *mrm; V_pf_status.states = uma_zone_get_cur(V_pf_state_z); /* * Go through hash and unlink states that expire now. */ while (maxcheck > 0) { ih = &V_pf_idhash[i]; /* only take the lock if we expect to do work */ if (!LIST_EMPTY(&ih->states)) { relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (pf_state_expires(s) <= time_uptime) { V_pf_status.states -= pf_unlink_state(s); goto relock; } s->rule.ptr->rule_ref |= PFRULE_REFS; if (s->nat_rule.ptr != NULL) s->nat_rule.ptr->rule_ref |= PFRULE_REFS; if (s->anchor.ptr != NULL) s->anchor.ptr->rule_ref |= PFRULE_REFS; s->kif->pfik_flags |= PFI_IFLAG_REFS; SLIST_FOREACH(mrm, &s->match_rules, entry) mrm->r->rule_ref |= PFRULE_REFS; if (s->rt_kif) s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } PF_HASHROW_UNLOCK(ih); } /* Return when we hit end of hash.
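 *
 * The increment below, together with pf_purge_thread() waking ten
 * times a second and passing pf_hashmask / (PFTM_INTERVAL * 10) as
 * maxcheck, means the whole hash is swept about once every
 * PFTM_INTERVAL (by default 10) seconds.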
*/ if (++i > pf_hashmask) { V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (0); } maxcheck--; } V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (i); } static void pf_purge_unlinked_rules(void) { struct pf_krulequeue tmpq; struct pf_krule *r, *r1; /* * If we have overloading task pending, then we'd * better skip purging this time. There is a tiny * probability that overloading task references * an already unlinked rule. */ PF_OVERLOADQ_LOCK(); if (!SLIST_EMPTY(&V_pf_overloadqueue)) { PF_OVERLOADQ_UNLOCK(); return; } PF_OVERLOADQ_UNLOCK(); /* * Do naive mark-and-sweep garbage collecting of old rules. * Reference flag is raised by pf_purge_expired_states() * and pf_purge_expired_src_nodes(). * * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, * use a temporary queue. */ TAILQ_INIT(&tmpq); PF_UNLNKDRULES_LOCK(); TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { if (!(r->rule_ref & PFRULE_REFS)) { TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); TAILQ_INSERT_TAIL(&tmpq, r, entries); } else r->rule_ref &= ~PFRULE_REFS; } PF_UNLNKDRULES_UNLOCK(); if (!TAILQ_EMPTY(&tmpq)) { PF_CONFIG_LOCK(); PF_RULES_WLOCK(); TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { TAILQ_REMOVE(&tmpq, r, entries); pf_free_rule(r); } PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); } } void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { u_int32_t a = ntohl(addr->addr32[0]); printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, (a>>8)&255, a&255); if (p) { p = ntohs(p); printf(":%u", p); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { u_int16_t b; u_int8_t i, curstart, curend, maxstart, maxend; curstart = curend = maxstart = maxend = 255; for (i = 0; i < 8; i++) { if (!addr->addr16[i]) { if (curstart == 255) curstart = i; curend = i; } else { if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } curstart = curend = 255; } } if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } for (i = 0; i < 8; i++) { if (i >= maxstart && i <= maxend) { if (i == 0) printf(":"); if (i == maxend) printf(":"); } else { b = ntohs(addr->addr16[i]); printf("%x", b); if (i < 7) printf(":"); } } if (p) { p = ntohs(p); printf("[%u]", p); } break; } #endif /* INET6 */ } } void pf_print_state(struct pf_kstate *s) { pf_print_state_parts(s, NULL, NULL); } static void pf_print_state_parts(struct pf_kstate *s, struct pf_state_key *skwp, struct pf_state_key *sksp) { struct pf_state_key *skw, *sks; u_int8_t proto, dir; /* Do our best to fill these, but they're skipped if NULL */ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); proto = skw ? skw->proto : (sks ? sks->proto : 0); dir = s ? 
s->direction : 0; switch (proto) { case IPPROTO_IPV4: printf("IPv4"); break; case IPPROTO_IPV6: printf("IPv6"); break; case IPPROTO_TCP: printf("TCP"); break; case IPPROTO_UDP: printf("UDP"); break; case IPPROTO_ICMP: printf("ICMP"); break; case IPPROTO_ICMPV6: printf("ICMPv6"); break; default: printf("%u", proto); break; } switch (dir) { case PF_IN: printf(" in"); break; case PF_OUT: printf(" out"); break; } if (skw) { printf(" wire: "); pf_print_host(&skw->addr[0], skw->port[0], skw->af); printf(" "); pf_print_host(&skw->addr[1], skw->port[1], skw->af); } if (sks) { printf(" stack: "); if (sks != skw) { pf_print_host(&sks->addr[0], sks->port[0], sks->af); printf(" "); pf_print_host(&sks->addr[1], sks->port[1], sks->af); } else printf("-"); } if (s) { if (proto == IPPROTO_TCP) { printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); printf("]"); printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); printf("]"); } printf(" %u:%u", s->src.state, s->dst.state); } } void pf_print_flags(u_int8_t f) { if (f) printf(" "); if (f & TH_FIN) printf("F"); if (f & TH_SYN) printf("S"); if (f & TH_RST) printf("R"); if (f & TH_PUSH) printf("P"); if (f & TH_ACK) printf("A"); if (f & TH_URG) printf("U"); if (f & TH_ECE) printf("E"); if (f & TH_CWR) printf("W"); } #define PF_SET_SKIP_STEPS(i) \ do { \ while (head[i] != cur) { \ head[i]->skip[i].ptr = cur; \ head[i] = TAILQ_NEXT(head[i], entries); \ } \ } while (0) void pf_calc_skip_steps(struct pf_krulequeue *rules) { struct pf_krule *cur, *prev, *head[PF_SKIP_COUNT]; int i; cur = TAILQ_FIRST(rules); prev = cur; for (i = 0; i < PF_SKIP_COUNT; ++i) head[i] = cur; while (cur != NULL) { if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) PF_SET_SKIP_STEPS(PF_SKIP_IFP); if (cur->direction != prev->direction) PF_SET_SKIP_STEPS(PF_SKIP_DIR); if (cur->af != prev->af) PF_SET_SKIP_STEPS(PF_SKIP_AF); if (cur->proto != prev->proto) PF_SET_SKIP_STEPS(PF_SKIP_PROTO); if (cur->src.neg != prev->src.neg || pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); if (cur->src.port[0] != prev->src.port[0] || cur->src.port[1] != prev->src.port[1] || cur->src.port_op != prev->src.port_op) PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); if (cur->dst.port[0] != prev->dst.port[0] || cur->dst.port[1] != prev->dst.port[1] || cur->dst.port_op != prev->dst.port_op) PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); prev = cur; cur = TAILQ_NEXT(cur, entries); } for (i = 0; i < PF_SKIP_COUNT; ++i) PF_SET_SKIP_STEPS(i); } int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { if (aw1->type != aw2->type) return (1); switch (aw1->type) { case PF_ADDR_ADDRMASK: case PF_ADDR_RANGE: if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6)) return (1); if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6)) return (1); return (0); case PF_ADDR_DYNIFTL: return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (aw1->p.tbl != aw2->p.tbl); default: printf("invalid address type: %d\n", aw1->type); return (1); } } /** * Checksum updates are a little complicated because the checksum in the 
TCP/UDP * header isn't always a full checksum. In some cases (i.e. output) it's a * pseudo-header checksum, which is a partial checksum over src/dst IP * addresses, protocol number and length. * * That means we have the following cases: * * Input or forwarding: we don't have TSO, the checksum fields are full * checksums, we need to update the checksum whenever we change anything. * * Output (i.e. the checksum is a pseudo-header checksum): * x The field being updated is src/dst address or affects the length of * the packet. We need to update the pseudo-header checksum (note that this * checksum is not ones' complement). * x Some other field is being modified (e.g. src/dst port numbers): We * don't have to update anything. **/ u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { u_int32_t x; x = cksum + old - new; x = (x + (x >> 16)) & 0xffff; /* optimise: eliminate a branch when not udp */ if (udp && cksum == 0x0000) return cksum; if (udp && x == 0x0000) x = 0xffff; return (u_int16_t)(x); } static void pf_patch_8(struct mbuf *m, u_int16_t *cksum, u_int8_t *f, u_int8_t v, bool hi, u_int8_t udp) { u_int16_t old = htons(hi ? (*f << 8) : *f); u_int16_t new = htons(hi ? ( v << 8) : v); if (*f == v) return; *f = v; if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) return; *cksum = pf_cksum_fixup(*cksum, old, new, udp); } void pf_patch_16_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int16_t v, bool hi, u_int8_t udp) { u_int8_t *fb = (u_int8_t *)f; u_int8_t *vb = (u_int8_t *)&v; pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); } void pf_patch_32_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int32_t v, bool hi, u_int8_t udp) { u_int8_t *fb = (u_int8_t *)f; u_int8_t *vb = (u_int8_t *)&v; pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); } u_int16_t pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) return (cksum); return (pf_cksum_fixup(cksum, old, new, udp)); } static void pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) { struct pf_addr ao; u_int16_t po = *p; PF_ACPY(&ao, a, af); PF_ACPY(a, an, af); if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) *pc = ~*pc; *p = pn; switch (af) { #ifdef INET case AF_INET: *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; *pc = pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET6 */ } if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { *pc = ~*pc; if (! 
*pc) *pc = 0xffff; } } /* Changes a u_int32_t. Uses a void * so there are no align restrictions */ void pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), ao % 65536, an % 65536, u); } void pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_proto_cksum_fixup(m, pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp), ao % 65536, an % 65536, udp); } #ifdef INET6 static void pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; PF_ACPY(&ao, a, AF_INET6); PF_ACPY(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*c, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); } #endif /* INET6 */ static void pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) { struct pf_addr oia, ooa; PF_ACPY(&oia, ia, af); if (oa) PF_ACPY(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { u_int16_t oip = *ip; u_int32_t opc; if (pc != NULL) opc = *pc; *ip = np; if (pc != NULL) *pc = pf_cksum_fixup(*pc, oip, *ip, u); *ic = pf_cksum_fixup(*ic, oip, *ip, 0); if (pc != NULL) *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ PF_ACPY(ia, na, af); switch (af) { #ifdef INET case AF_INET: { u_int32_t oh2c = *h2c; *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); break; } #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], u), oia.addr16[1], ia->addr16[1], u), oia.addr16[2], ia->addr16[2], u), oia.addr16[3], ia->addr16[3], u), oia.addr16[4], ia->addr16[4], u), oia.addr16[5], ia->addr16[5], u), oia.addr16[6], ia->addr16[6], u), oia.addr16[7], ia->addr16[7], u); break; #endif /* INET6 */ } /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. 
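 *
 * All of these fixups exploit the incremental-update property of the
 * Internet checksum (RFC 1624): replacing a 16-bit word requires only
 * folding the difference between old and new into the existing sum.
 * An illustrative sketch for a TCP port rewrite from 8080 to 80:
 *
 *	th->th_sum = pf_cksum_fixup(th->th_sum, htons(8080), htons(80), 0);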
*/ if (oa) { PF_ACPY(oa, na, af); switch (af) { #ifdef INET case AF_INET: *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, ooa.addr16[0], oa->addr16[0], 0), ooa.addr16[1], oa->addr16[1], 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, ooa.addr16[0], oa->addr16[0], u), ooa.addr16[1], oa->addr16[1], u), ooa.addr16[2], oa->addr16[2], u), ooa.addr16[3], oa->addr16[3], u), ooa.addr16[4], oa->addr16[4], u), ooa.addr16[5], oa->addr16[5], u), ooa.addr16[6], oa->addr16[6], u), ooa.addr16[7], oa->addr16[7], u); break; #endif /* INET6 */ } } } /* * Need to modulate the sequence numbers in the TCP SACK option * (credits to Krzysztof Pfaff for report and patch) */ static int pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *dst) { int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; u_int8_t opts[TCP_MAXOLEN], *opt = opts; int copyback = 0, i, olen; struct sackblk sack; #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) if (hlen < TCPOLEN_SACKLEN || !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) return 0; while (hlen >= TCPOLEN_SACKLEN) { size_t startoff = opt - opts; olen = opt[1]; switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_SACK: if (olen > hlen) olen = hlen; if (olen >= TCPOLEN_SACKLEN) { for (i = 2; i + TCPOLEN_SACK <= olen; i += TCPOLEN_SACK) { memcpy(&sack, &opt[i], sizeof(sack)); pf_patch_32_unaligned(m, &th->th_sum, &sack.start, htonl(ntohl(sack.start) - dst->seqdiff), PF_ALGNMNT(startoff), 0); pf_patch_32_unaligned(m, &th->th_sum, &sack.end, htonl(ntohl(sack.end) - dst->seqdiff), PF_ALGNMNT(startoff), 0); memcpy(&opt[i], &sack, sizeof(sack)); } copyback = 1; } /* FALLTHROUGH */ default: if (olen < 2) olen = 2; hlen -= olen; opt += olen; } } if (copyback) m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); return (copyback); } struct mbuf * pf_build_tcp(const struct pf_krule *r, sa_family_t af, const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, bool skip_firewall, u_int16_t mtag_tag, u_int16_t mtag_flags, int rtableid) { struct mbuf *m; int len, tlen; #ifdef INET struct ip *h = NULL; #endif /* INET */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif /* INET6 */ struct tcphdr *th; char *opt; struct pf_mtag *pf_mtag; len = 0; th = NULL; /* maximum segment size tcp option */ tlen = sizeof(struct tcphdr); if (mss) tlen += 4; switch (af) { #ifdef INET case AF_INET: len = sizeof(struct ip) + tlen; break; #endif /* INET */ #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr) + tlen; break; #endif /* INET6 */ default: panic("%s: unsupported af %d", __func__, af); } m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #ifdef MAC mac_netinet_firewall_send(m); #endif if ((pf_mtag = pf_get_mtag(m)) == NULL) { m_freem(m); return (NULL); } if (skip_firewall) m->m_flags |= M_SKIP_FIREWALL; pf_mtag->tag = mtag_tag; pf_mtag->flags = mtag_flags; if (rtableid >= 0) M_SETFIB(m, rtableid); #ifdef ALTQ if (r != NULL && r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m, struct ip *); } #endif /* ALTQ */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; /* The rest of the stack assumes a rcvif, so provide one. * This is a locally generated packet, so .. close enough. 
*/ m->m_pkthdr.rcvif = V_loif; bzero(m->m_data, len); switch (af) { #ifdef INET case AF_INET: h = mtod(m, struct ip *); /* IP header fields included in the TCP checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(tlen); h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); /* IP header fields included in the TCP checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(tlen); memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); break; #endif /* INET6 */ } /* TCP header */ th->th_sport = sport; th->th_dport = dport; th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_off = tlen >> 2; th->th_flags = tcp_flags; th->th_win = htons(win); if (mss) { opt = (char *)(th + 1); opt[0] = TCPOPT_MAXSEG; opt[1] = 4; HTONS(mss); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); } switch (af) { #ifdef INET case AF_INET: /* TCP checksum */ th->th_sum = in_cksum(m, len); /* Finish the IP header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); h->ip_len = htons(len); h->ip_ttl = ttl ? ttl : V_ip_defttl; h->ip_sum = 0; break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* TCP checksum */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif /* INET6 */ } return (m); } void pf_send_tcp(const struct pf_krule *r, sa_family_t af, const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, bool skip_firewall, u_int16_t mtag_tag, u_int16_t mtag_flags, int rtableid) { struct pf_send_entry *pfse; struct mbuf *m; m = pf_build_tcp(r, af, saddr, daddr, sport, dport, seq, ack, tcp_flags, win, mss, ttl, skip_firewall, mtag_tag, mtag_flags, rtableid); if (m == NULL) return; /* Allocate outgoing queue entry, mbuf and mbuf tag. 
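 *
 * The entry is only queued here; transmission happens asynchronously in
 * pf_intr() once pf_send() schedules the software interrupt. A typical
 * use is the RST+ACK sent for rejected connections, cf. the call in
 * pf_return() below:
 *
 *	pf_send_tcp(r, af, pd->dst, pd->src, th->th_dport, th->th_sport,
 *	    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl,
 *	    true, 0, 0, rtableid);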
*/ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) { m_freem(m); return; } switch (af) { #ifdef INET case AF_INET: pfse->pfse_type = PFSE_IP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pfse->pfse_type = PFSE_IP6; break; #endif /* INET6 */ } pfse->pfse_m = m; pf_send(pfse); } static void pf_return(struct pf_krule *r, struct pf_krule *nr, struct pf_pdesc *pd, struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th, struct pfi_kkif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, u_short *reason, int rtableid) { struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; /* undo NAT changes, if they have taken place */ if (nr != NULL) { PF_ACPY(saddr, &sk->addr[pd->sidx], af); PF_ACPY(daddr, &sk->addr[pd->didx], af); if (pd->sport) *pd->sport = sk->port[pd->sidx]; if (pd->dport) *pd->dport = sk->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } if (pd->proto == IPPROTO_TCP && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURN)) && !(th->th_flags & TH_RST)) { u_int32_t ack = ntohl(th->th_seq) + pd->p_len; int len = 0; #ifdef INET struct ip *h4; #endif #ifdef INET6 struct ip6_hdr *h6; #endif switch (af) { #ifdef INET case AF_INET: h4 = mtod(m, struct ip *); len = ntohs(h4->ip_len) - off; break; #endif #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); break; #endif } if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) REASON_SET(reason, PFRES_PROTCKSUM); else { if (th->th_flags & TH_SYN) ack++; if (th->th_flags & TH_FIN) ack++; pf_send_tcp(r, af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl, true, 0, 0, rtableid); } } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r, rtableid); else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r, rtableid); } static int pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m) { struct m_tag *mtag; u_int8_t mpcp; mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) return (0); if (prio == PF_PRIO_ZERO) prio = 0; mpcp = *(uint8_t *)(mtag + 1); return (mpcp == prio); } static int pf_icmp_to_bandlim(uint8_t type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: return (BANDLIM_ICMP_ECHO); case ICMP_TSTAMP: case ICMP_TSTAMPREPLY: return (BANDLIM_ICMP_TSTAMP); case ICMP_UNREACH: default: return (BANDLIM_ICMP_UNREACH); } } static void pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, struct pf_krule *r, int rtableid) { struct pf_send_entry *pfse; struct mbuf *m0; struct pf_mtag *pf_mtag; /* ICMP packet rate limitation. */ #ifdef INET6 if (af == AF_INET6) { if (icmp6_ratelimit(NULL, type, code)) return; } #endif #ifdef INET if (af == AF_INET) { if (badport_bandlim(pf_icmp_to_bandlim(type)) != 0) return; } #endif /* Allocate outgoing queue entry, mbuf and mbuf tag. 
*/ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { free(pfse, M_PFTEMP); return; } if ((pf_mtag = pf_get_mtag(m0)) == NULL) { free(pfse, M_PFTEMP); return; } /* XXX: revisit */ m0->m_flags |= M_SKIP_FIREWALL; if (rtableid >= 0) M_SETFIB(m0, rtableid); #ifdef ALTQ if (r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m0, struct ip *); } #endif /* ALTQ */ switch (af) { #ifdef INET case AF_INET: pfse->pfse_type = PFSE_ICMP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pfse->pfse_type = PFSE_ICMP6; break; #endif /* INET6 */ } pfse->pfse_m = m0; pfse->icmpopts.type = type; pfse->icmpopts.code = code; pf_send(pfse); } /* * Return 1 if the addresses a and b match (with mask m), otherwise return 0. * If n is 0, they match if they are equal. If n is != 0, they match if they * are different. */ int pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, struct pf_addr *b, sa_family_t af) { int match = 0; switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) match++; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) && ((a->addr32[1] & m->addr32[1]) == (b->addr32[1] & m->addr32[1])) && ((a->addr32[2] & m->addr32[2]) == (b->addr32[2] & m->addr32[2])) && ((a->addr32[3] & m->addr32[3]) == (b->addr32[3] & m->addr32[3]))) match++; break; #endif /* INET6 */ } if (match) { if (n) return (0); else return (1); } else { if (n) return (1); else return (0); } } /* * Return 1 if b <= a <= e, otherwise return 0. */ int pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) || (ntohl(a->addr32[0]) > ntohl(e->addr32[0]))) return (0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: { int i; /* check a >= b */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) > ntohl(b->addr32[i])) break; else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i])) return (0); /* check a <= e */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) < ntohl(e->addr32[i])) break; else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i])) return (0); break; } #endif /* INET6 */ } return (1); } static int pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) { switch (op) { case PF_OP_IRG: return ((p > a1) && (p < a2)); case PF_OP_XRG: return ((p < a1) || (p > a2)); case PF_OP_RRG: return ((p >= a1) && (p <= a2)); case PF_OP_EQ: return (p == a1); case PF_OP_NE: return (p != a1); case PF_OP_LT: return (p < a1); case PF_OP_LE: return (p <= a1); case PF_OP_GT: return (p > a1); case PF_OP_GE: return (p >= a1); } return (0); /* never reached */ } int pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) { NTOHS(a1); NTOHS(a2); NTOHS(p); return (pf_match(op, a1, a2, p)); } static int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } static int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } int pf_match_tag(struct mbuf *m, struct pf_krule *r, int *tag, int mtag) { if (*tag == -1) *tag = mtag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } int pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag) { 
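/*
 * A worked example for pf_match_addr() above: with n == 0,
 * a = 10.0.0.5, m = 255.255.255.0 and b = 10.0.0.200, the masked
 * compare sees 10.0.0.0 on both sides and returns 1 (match); n != 0
 * inverts the result, which is how negated address matches behave.
 */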
KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) return (ENOMEM); pd->pf_mtag->tag = tag; return (0); } #define PF_ANCHOR_STACKSIZE 32 struct pf_kanchor_stackframe { struct pf_kruleset *rs; struct pf_krule *r; /* XXX: + match bit */ struct pf_kanchor *child; }; /* * XXX: We rely on malloc(9) returning pointer aligned addresses. */ #define PF_ANCHORSTACK_MATCH 0x00000001 #define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_RULE(f) (struct pf_krule *) \ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) #define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ } while (0) void pf_step_into_anchor(struct pf_kanchor_stackframe *stack, int *depth, struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a, int *match) { struct pf_kanchor_stackframe *f; PF_RULES_RASSERT(); if (match) *match = 0; if (*depth >= PF_ANCHOR_STACKSIZE) { printf("%s: anchor stack overflow on %s\n", __func__, (*r)->anchor->name); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { struct pf_kanchor_node *parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_kanchor_node, parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); } int pf_step_out_of_anchor(struct pf_kanchor_stackframe *stack, int *depth, struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a, int *match) { struct pf_kanchor_stackframe *f; struct pf_krule *fr; int quick = 0; PF_RULES_RASSERT(); do { if (*depth <= 0) break; f = stack + *depth - 1; fr = PF_ANCHOR_RULE(f); if (f->child != NULL) { /* * This block traverses through * a wildcard anchor. */ if (match != NULL && *match) { /* * If any of "*" matched, then * "foo/ *" matched, mark frame * appropriately. 
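 *
 * E.g. for an anchor "foo" with child anchors "one" and "two"
 * (a hypothetical ruleset): a match inside "two" marks the
 * "foo/ *" frame here, so that when the traversal steps back out
 * below, fr->quick is honoured and the match is not forgotten.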
*/ PF_ANCHOR_SET_MATCH(f); *match = 0; } f->child = RB_NEXT(pf_kanchor_node, &fr->anchor->children, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (PF_ANCHOR_MATCH(f) || (match != NULL && *match)) quick = fr->quick; *r = TAILQ_NEXT(fr, entries); } while (*r == NULL); return (quick); } struct pf_keth_anchor_stackframe { struct pf_keth_ruleset *rs; struct pf_keth_rule *r; /* XXX: + match bit */ struct pf_keth_anchor *child; }; #define PF_ETH_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) #define PF_ETH_ANCHOR_RULE(f) (struct pf_keth_rule *) \ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) #define PF_ETH_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ } while (0) void pf_step_into_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth, struct pf_keth_ruleset **rs, struct pf_keth_rule **r, struct pf_keth_rule **a, int *match) { struct pf_keth_anchor_stackframe *f; NET_EPOCH_ASSERT(); if (match) *match = 0; if (*depth >= PF_ANCHOR_STACKSIZE) { printf("%s: anchor stack overflow on %s\n", __func__, (*r)->anchor->name); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { struct pf_keth_anchor_node *parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_keth_anchor_node, parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->active.rules); } int pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth, struct pf_keth_ruleset **rs, struct pf_keth_rule **r, struct pf_keth_rule **a, int *match) { struct pf_keth_anchor_stackframe *f; struct pf_keth_rule *fr; int quick = 0; NET_EPOCH_ASSERT(); do { if (*depth <= 0) break; f = stack + *depth - 1; fr = PF_ETH_ANCHOR_RULE(f); if (f->child != NULL) { /* * This block traverses through * a wildcard anchor. */ if (match != NULL && *match) { /* * If any of "*" matched, then * "foo/ *" matched, mark frame * appropriately. 
*/ PF_ETH_ANCHOR_SET_MATCH(f); *match = 0; } f->child = RB_NEXT(pf_keth_anchor_node, &fr->anchor->children, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->active.rules); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (PF_ETH_ANCHOR_MATCH(f) || (match != NULL && *match)) quick = fr->quick; *r = TAILQ_NEXT(fr, entries); } while (*r == NULL); return (quick); } #ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; } } void pf_addr_inc(struct pf_addr *addr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; if (addr->addr32[2] == 0xffffffff) { addr->addr32[2] = 0; if (addr->addr32[1] == 0xffffffff) { addr->addr32[1] = 0; addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); } else addr->addr32[1] = htonl(ntohl(addr->addr32[1]) + 1); } else addr->addr32[2] = htonl(ntohl(addr->addr32[2]) + 1); } else addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; } } #endif /* INET6 */ void pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a) { /* * Modern rules use the same flags in rules as they do in states. */ a->flags |= (r->scrub_flags & (PFSTATE_NODF|PFSTATE_RANDOMID| PFSTATE_SCRUB_TCP|PFSTATE_SETPRIO)); /* * Old-style scrub rules have different flags which need to be translated. 
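 *
 * E.g. a hypothetical old-style rule "scrub in all random-id
 * min-ttl 64 max-mss 1440" arrives with PFRULE_RANDOMID set and
 * min_ttl/max_mss populated; the translation below folds these into
 * the same pf_rule_actions fields that modern rules fill directly.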
*/ if (r->rule_flag & PFRULE_RANDOMID) a->flags |= PFSTATE_RANDOMID; if (r->scrub_flags & PFSTATE_SETTOS || r->rule_flag & PFRULE_SET_TOS ) { a->flags |= PFSTATE_SETTOS; a->set_tos = r->set_tos; } if (r->qid) a->qid = r->qid; if (r->pqid) a->pqid = r->pqid; if (r->rtableid >= 0) a->rtableid = r->rtableid; a->log |= r->log; if (r->min_ttl) a->min_ttl = r->min_ttl; if (r->max_mss) a->max_mss = r->max_mss; if (r->dnpipe) a->dnpipe = r->dnpipe; if (r->dnrpipe) a->dnrpipe = r->dnrpipe; if (r->dnpipe || r->dnrpipe) { if (r->free_flags & PFRULE_DN_IS_PIPE) a->flags |= PFSTATE_DN_IS_PIPE; else a->flags &= ~PFSTATE_DN_IS_PIPE; } if (r->scrub_flags & PFSTATE_SETPRIO) { a->set_prio[0] = r->set_prio[0]; a->set_prio[1] = r->set_prio[1]; } } int pf_socket_lookup(struct pf_pdesc *pd, struct mbuf *m) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; struct inpcbinfo *pi; struct inpcb *inp; pd->lookup.uid = UID_MAX; pd->lookup.gid = GID_MAX; switch (pd->proto) { case IPPROTO_TCP: sport = pd->hdr.tcp.th_sport; dport = pd->hdr.tcp.th_dport; pi = &V_tcbinfo; break; case IPPROTO_UDP: sport = pd->hdr.udp.uh_sport; dport = pd->hdr.udp.uh_dport; pi = &V_udbinfo; break; default: return (-1); } if (pd->dir == PF_IN) { saddr = pd->src; daddr = pd->dst; } else { u_int16_t p; p = sport; sport = dport; dport = p; saddr = pd->dst; daddr = pd->src; } switch (pd->af) { #ifdef INET case AF_INET: inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET6 */ default: return (-1); } INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; INP_RUNLOCK(inp); return (1); } u_int8_t pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int8_t wscale = 0; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= 3) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_WINDOW: wscale = opt[2]; if (wscale > TCP_MAX_WINSHIFT) wscale = TCP_MAX_WINSHIFT; wscale |= PF_WSCALE_FLAG; /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (wscale); } u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int16_t mss = V_tcp_mssdflt; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= TCPOLEN_MAXSEG) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_MAXSEG: bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); NTOHS(mss); /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += 
optlen; break; } } return (mss); } static u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) { struct nhop_object *nh; #ifdef INET6 struct in6_addr dst6; uint32_t scopeid; #endif /* INET6 */ int hlen = 0; uint16_t mss = 0; NET_EPOCH_ASSERT(); switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); nh = fib4_lookup(rtableid, addr->v4, 0, 0, 0); if (nh != NULL) mss = nh->nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); in6_splitscope(&addr->v6, &dst6, &scopeid); nh = fib6_lookup(rtableid, &dst6, scopeid, 0, 0); if (nh != NULL) mss = nh->nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET6 */ } mss = max(V_tcp_mssdflt, mss); mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); } static u_int32_t pf_tcp_iss(struct pf_pdesc *pd) { MD5_CTX ctx; u_int32_t digest[4]; if (V_pf_tcp_secret_init == 0) { arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); MD5Init(&V_pf_tcp_secret_ctx); MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); V_pf_tcp_secret_init = 1; } ctx = V_pf_tcp_secret_ctx; MD5Update(&ctx, (char *)&pd->hdr.tcp.th_sport, sizeof(u_short)); MD5Update(&ctx, (char *)&pd->hdr.tcp.th_dport, sizeof(u_short)); if (pd->af == AF_INET6) { MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); } else { MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); } MD5Final((u_char *)digest, &ctx); V_pf_tcp_iss_off += 4096; #define ISN_RANDOM_INCREMENT (4096 - 1) return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + V_pf_tcp_iss_off); #undef ISN_RANDOM_INCREMENT } static bool pf_match_eth_addr(const uint8_t *a, const struct pf_keth_rule_addr *r) { bool match = true; /* Always matches if not set */ if (! r->isset) return (!r->neg); for (int i = 0; i < ETHER_ADDR_LEN; i++) { if ((a[i] & r->mask[i]) != (r->addr[i] & r->mask[i])) { match = false; break; } } return (match ^ r->neg); } static int pf_match_eth_tag(struct mbuf *m, struct pf_keth_rule *r, int *tag, int mtag) { if (*tag == -1) *tag = mtag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } static void pf_bridge_to(struct ifnet *ifp, struct mbuf *m) { /* If we don't have the interface drop the packet. 
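 *
 * (A worked example for pf_calc_mss() above: an IPv4 route with
 * nh_mtu = 1500 yields 1500 - 20 - 20 = 1460; a peer offer of 1400
 * clamps that to 1400, and the floor of 64 guards against absurdly
 * small MTUs.)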
*/ if (ifp == NULL) { m_freem(m); return; } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: break; default: m_freem(m); return; } ifp->if_transmit(ifp, m); } static int pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0) { #ifdef INET struct ip ip; #endif #ifdef INET6 struct ip6_hdr ip6; #endif struct mbuf *m = *m0; struct ether_header *e; struct pf_keth_rule *r, *rm, *a = NULL; struct pf_keth_ruleset *ruleset = NULL; struct pf_mtag *mtag; struct pf_keth_ruleq *rules; struct pf_addr *src = NULL, *dst = NULL; struct pfi_kkif *bridge_to; sa_family_t af = 0; uint16_t proto; int asd = 0, match = 0; int tag = -1; uint8_t action; struct pf_keth_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; MPASS(kif->pfik_ifp->if_vnet == curvnet); NET_EPOCH_ASSERT(); PF_RULES_RLOCK_TRACKER; SDT_PROBE3(pf, eth, test_rule, entry, dir, kif->pfik_ifp, m); mtag = pf_find_mtag(m); if (mtag != NULL && mtag->flags & PF_MTAG_FLAG_DUMMYNET) { /* Dummynet re-injects packets after they've * completed their delay. We've already * processed them, so pass unconditionally. */ /* But only once. We may see the packet multiple times (e.g. * PFIL_IN/PFIL_OUT). */ mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET; return (PF_PASS); } ruleset = V_pf_keth; rules = ck_pr_load_ptr(&ruleset->active.rules); r = TAILQ_FIRST(rules); rm = NULL; e = mtod(m, struct ether_header *); proto = ntohs(e->ether_type); switch (proto) { #ifdef INET case ETHERTYPE_IP: { if (m_length(m, NULL) < (sizeof(struct ether_header) + sizeof(ip))) return (PF_DROP); af = AF_INET; m_copydata(m, sizeof(struct ether_header), sizeof(ip), (caddr_t)&ip); src = (struct pf_addr *)&ip.ip_src; dst = (struct pf_addr *)&ip.ip_dst; break; } #endif /* INET */ #ifdef INET6 case ETHERTYPE_IPV6: { if (m_length(m, NULL) < (sizeof(struct ether_header) + sizeof(ip6))) return (PF_DROP); af = AF_INET6; m_copydata(m, sizeof(struct ether_header), sizeof(ip6), (caddr_t)&ip6); src = (struct pf_addr *)&ip6.ip6_src; dst = (struct pf_addr *)&ip6.ip6_dst; break; } #endif /* INET6 */ } PF_RULES_RLOCK(); while (r != NULL) { counter_u64_add(r->evaluations, 1); SDT_PROBE2(pf, eth, test_rule, test, r->nr, r); if (pfi_kkif_match(r->kif, kif) == r->ifnot) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "kif"); r = r->skip[PFE_SKIP_IFP].ptr; } else if (r->direction && r->direction != dir) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "dir"); r = r->skip[PFE_SKIP_DIR].ptr; } else if (r->proto && r->proto != proto) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "proto"); r = r->skip[PFE_SKIP_PROTO].ptr; } else if (! pf_match_eth_addr(e->ether_shost, &r->src)) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "src"); r = r->skip[PFE_SKIP_SRC_ADDR].ptr; } else if (! pf_match_eth_addr(e->ether_dhost, &r->dst)) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "dst"); r = r->skip[PFE_SKIP_DST_ADDR].ptr; } else if (src != NULL && PF_MISMATCHAW(&r->ipsrc.addr, src, af, r->ipsrc.neg, kif, M_GETFIB(m))) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "ip_src"); r = r->skip[PFE_SKIP_SRC_IP_ADDR].ptr; } else if (dst != NULL && PF_MISMATCHAW(&r->ipdst.addr, dst, af, r->ipdst.neg, kif, M_GETFIB(m))) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "ip_dst"); r = r->skip[PFE_SKIP_DST_IP_ADDR].ptr; } else if (r->match_tag && !pf_match_eth_tag(m, r, &tag, mtag ? 
mtag->tag : 0)) { SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r, "match_tag"); r = TAILQ_NEXT(r, entries); } else { if (r->tag) tag = r->tag; if (r->anchor == NULL) { /* Rule matches */ rm = r; SDT_PROBE2(pf, eth, test_rule, match, r->nr, r); if (r->quick) break; r = TAILQ_NEXT(r, entries); } else { pf_step_into_keth_anchor(anchor_stack, &asd, &ruleset, &r, &a, &match); } } if (r == NULL && pf_step_out_of_keth_anchor(anchor_stack, &asd, &ruleset, &r, &a, &match)) break; } r = rm; SDT_PROBE2(pf, eth, test_rule, final_match, (r != NULL ? r->nr : -1), r); /* Default to pass. */ if (r == NULL) { PF_RULES_RUNLOCK(); return (PF_PASS); } /* Execute action. */ counter_u64_add(r->packets[dir == PF_OUT], 1); counter_u64_add(r->bytes[dir == PF_OUT], m_length(m, NULL)); pf_update_timestamp(r); /* Shortcut. Don't tag if we're just going to drop anyway. */ if (r->action == PF_DROP) { PF_RULES_RUNLOCK(); return (PF_DROP); } if (tag > 0) { if (mtag == NULL) mtag = pf_get_mtag(m); if (mtag == NULL) { PF_RULES_RUNLOCK(); counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1); return (PF_DROP); } mtag->tag = tag; } if (r->qid != 0) { if (mtag == NULL) mtag = pf_get_mtag(m); if (mtag == NULL) { PF_RULES_RUNLOCK(); counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1); return (PF_DROP); } mtag->qid = r->qid; } action = r->action; bridge_to = r->bridge_to; /* Dummynet */ if (r->dnpipe) { struct ip_fw_args dnflow; /* Drop packet if dummynet is not loaded. */ if (ip_dn_io_ptr == NULL) { PF_RULES_RUNLOCK(); m_freem(m); counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1); return (PF_DROP); } if (mtag == NULL) mtag = pf_get_mtag(m); if (mtag == NULL) { PF_RULES_RUNLOCK(); counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1); return (PF_DROP); } bzero(&dnflow, sizeof(dnflow)); /* We don't have port numbers here, so we set 0. That means * that we'll be somewhat limited in distinguishing flows (i.e. * only based on IP addresses, not based on port numbers), but * it's better than nothing. */ dnflow.f_id.dst_port = 0; dnflow.f_id.src_port = 0; dnflow.f_id.proto = 0; dnflow.rule.info = r->dnpipe; dnflow.rule.info |= IPFW_IS_DUMMYNET; if (r->dnflags & PFRULE_DN_IS_PIPE) dnflow.rule.info |= IPFW_IS_PIPE; dnflow.f_id.extra = dnflow.rule.info; dnflow.flags = dir == PF_IN ? IPFW_ARGS_IN : IPFW_ARGS_OUT; dnflow.flags |= IPFW_ARGS_ETHER; dnflow.ifp = kif->pfik_ifp; switch (af) { case AF_INET: dnflow.f_id.addr_type = 4; dnflow.f_id.src_ip = src->v4.s_addr; dnflow.f_id.dst_ip = dst->v4.s_addr; break; case AF_INET6: dnflow.flags |= IPFW_ARGS_IP6; dnflow.f_id.addr_type = 6; dnflow.f_id.src_ip6 = src->v6; dnflow.f_id.dst_ip6 = dst->v6; break; } PF_RULES_RUNLOCK(); mtag->flags |= PF_MTAG_FLAG_DUMMYNET; ip_dn_io_ptr(m0, &dnflow); if (*m0 != NULL) mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET; } else { PF_RULES_RUNLOCK(); } if (action == PF_PASS && bridge_to) { pf_bridge_to(bridge_to->pfik_ifp, *m0); *m0 = NULL; /* We've eaten the packet. 
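 *
 * pf_bridge_to() always consumes the mbuf: it either hands it to
 * ifp->if_transmit() or frees it, so clearing *m0 keeps the caller
 * from touching or double-freeing it.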
*/ } return (action); } static int pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_krule **am, struct pf_kruleset **rsm, struct inpcb *inp) { struct pf_krule *nr = NULL; struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; struct pf_krule *r, *a = NULL; struct pf_kruleset *ruleset = NULL; struct pf_krule_slist match_rules; struct pf_krule_item *ri; struct pf_ksrc_node *nsn = NULL; struct tcphdr *th = &pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; u_short reason; int rewrite = 0, hdrlen = 0; int tag = -1; int asd = 0; int match = 0; int state_icmp = 0; u_int16_t sport = 0, dport = 0; u_int16_t bproto_sum = 0, bip_sum = 0; u_int8_t icmptype = 0, icmpcode = 0; struct pf_kanchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; pd->lookup.done = 1; } switch (pd->proto) { case IPPROTO_TCP: sport = th->th_sport; dport = th->th_dport; hdrlen = sizeof(*th); break; case IPPROTO_UDP: sport = pd->hdr.udp.uh_sport; dport = pd->hdr.udp.uh_dport; hdrlen = sizeof(pd->hdr.udp); break; + case IPPROTO_SCTP: + sport = pd->hdr.sctp.src_port; + dport = pd->hdr.sctp.dest_port; + hdrlen = sizeof(pd->hdr.sctp); + break; #ifdef INET case IPPROTO_ICMP: if (pd->af != AF_INET) break; sport = dport = pd->hdr.icmp.icmp_id; hdrlen = sizeof(pd->hdr.icmp); icmptype = pd->hdr.icmp.icmp_type; icmpcode = pd->hdr.icmp.icmp_code; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: if (af != AF_INET6) break; sport = dport = pd->hdr.icmp6.icmp6_id; hdrlen = sizeof(pd->hdr.icmp6); icmptype = pd->hdr.icmp6.icmp6_type; icmpcode = pd->hdr.icmp6.icmp6_code; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ default: sport = dport = hdrlen = 0; break; } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ if ((nr = pf_get_translation(pd, m, off, kif, &nsn, &sk, &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { KASSERT(sk != NULL, ("%s: null sk", __func__)); KASSERT(nk != NULL, ("%s: null nk", __func__)); if (nr->log) { PFLOG_PACKET(kif, m, af, PFRES_MATCH, nr, a, ruleset, pd, 1); } if (pd->ip_sum) bip_sum = *pd->ip_sum; switch (pd->proto) { case IPPROTO_TCP: bproto_sum = th->th_sum; pd->proto_sum = &th->th_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, af); pd->sport = &th->th_sport; sport = th->th_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, af); dport = th->th_dport; pd->dport = &th->th_dport; } rewrite++; break; case IPPROTO_UDP: bproto_sum = pd->hdr.udp.uh_sum; pd->proto_sum = &pd->hdr.udp.uh_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &pd->hdr.udp.uh_sport, pd->ip_sum, &pd->hdr.udp.uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, af); 
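/* pf_change_ap() rewrote the header in place; refresh the cached port and pd pointer so the rule-matching loop below compares against the translated value. */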
sport = pd->hdr.udp.uh_sport; pd->sport = &pd->hdr.udp.uh_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &pd->hdr.udp.uh_dport, pd->ip_sum, &pd->hdr.udp.uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, af); dport = pd->hdr.udp.uh_dport; pd->dport = &pd->hdr.udp.uh_dport; } rewrite++; break; #ifdef INET case IPPROTO_ICMP: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[1] != pd->hdr.icmp.icmp_id) { pd->hdr.icmp.icmp_cksum = pf_cksum_fixup( pd->hdr.icmp.icmp_cksum, sport, nk->port[1], 0); pd->hdr.icmp.icmp_id = nk->port[1]; pd->sport = &pd->hdr.icmp.icmp_id; } m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->didx], 0); rewrite++; break; #endif /* INET6 */ default: switch (af) { #ifdef INET case AF_INET: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(saddr, &nk->addr[pd->sidx], af); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(daddr, &nk->addr[pd->didx], af); break; #endif /* INET6 */ } break; } if (nr->natpass) r = NULL; pd->nat_rule = nr; } SLIST_INIT(&match_rules); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ else if (r->type && r->type != icmptype + 1) r = TAILQ_NEXT(r, entries); /* icmp only. code always 0 in other cases */ else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); /* tcp/udp only.
uid.op always 0 in other cases */ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(pd, m), 1)) && !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); /* tcp/udp only. gid.op always 0 in other cases */ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(pd, m), 1)) && !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->anchor == NULL) { if (r->action == PF_MATCH) { ri = malloc(sizeof(struct pf_krule_item), M_PF_RULE_ITEM, M_NOWAIT | M_ZERO); if (ri == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } ri->r = r; SLIST_INSERT_HEAD(&match_rules, ri, entry); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); if (r->log) PFLOG_PACKET(kif, m, af, PFRES_MATCH, r, a, ruleset, pd, 1); } else { match = 1; *rm = r; *am = a; *rsm = ruleset; } if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); /* apply actions for last matching pass/block rule */ pf_rule_to_actions(r, &pd->act); if (r->log) { if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); PFLOG_PACKET(kif, m, af, reason, r, a, ruleset, pd, 1); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason, r->rtableid); } if (r->action == PF_DROP) goto cleanup; if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } if (pd->act.rtableid >= 0) M_SETFIB(m, pd->act.rtableid); if (!state_icmp && (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM))) { int action; action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, hdrlen, &match_rules); if (action != PF_PASS) { if (action == PF_DROP && (r->rule_flag & PFRULE_RETURN)) pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason, pd->act.rtableid); return (action); } } else { uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && pd->dir == PF_OUT && V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m)) /* * We want the state created, but we don't * want to send this in case a partner * firewall has to know about it to allow * replies through it.
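 * PF_DEFER makes the caller hold the packet until the pfsync
 * peer has acknowledged the new state (or the defer timeout
 * fires), rather than transmitting it immediately.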
*/ return (PF_DEFER); return (PF_PASS); cleanup: while ((ri = SLIST_FIRST(&match_rules))) { SLIST_REMOVE_HEAD(&match_rules, entry); free(ri, M_PF_RULE_ITEM); } uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); return (PF_DROP); } static int pf_create_state(struct pf_krule *r, struct pf_krule *nr, struct pf_krule *a, struct pf_pdesc *pd, struct pf_ksrc_node *nsn, struct pf_state_key *nk, struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm, int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, struct pf_krule_slist *match_rules) { struct pf_kstate *s = NULL; struct pf_ksrc_node *sn = NULL; struct tcphdr *th = &pd->hdr.tcp; u_int16_t mss = V_tcp_mssdflt; u_short reason, sn_reason; /* check maximums */ if (r->max_states && (counter_u64_fetch(r->states_cur) >= r->max_states)) { counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); REASON_SET(&reason, PFRES_MAXSTATES); goto csfailed; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && (sn_reason = pf_insert_src_node(&sn, r, pd->src, pd->af)) != 0) { REASON_SET(&reason, sn_reason); goto csfailed; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && (sn_reason = pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) != 0 ) { REASON_SET(&reason, sn_reason); goto csfailed; } s = pf_alloc_state(M_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto csfailed; } s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; bcopy(match_rules, &s->match_rules, sizeof(s->match_rules)); memcpy(&s->act, &pd->act, sizeof(struct pf_rule_actions)); STATE_INC_COUNTERS(s); if (r->allow_opts) s->state_flags |= PFSTATE_ALLOWOPTS; if (r->rule_flag & PFRULE_STATESLOPPY) s->state_flags |= PFSTATE_SLOPPY; if (pd->flags & PFDESC_TCP_NORM) /* Set by old-style scrub rules */ s->state_flags |= PFSTATE_SCRUB_TCP; s->act.log = pd->act.log & PF_LOG_ALL; s->sync_state = PFSYNC_S_NONE; s->state_flags |= pd->act.flags; /* Only needed for pfsync and state export */ if (nr != NULL) s->act.log |= nr->log & PF_LOG_ALL; switch (pd->proto) { case IPPROTO_TCP: s->src.seqlo = ntohl(th->th_seq); s->src.seqhi = s->src.seqlo + pd->p_len + 1; if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_MODULATE) { /* Generate sequence number modulator */ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == 0) s->src.seqdiff = 1; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(s->src.seqlo + s->src.seqdiff), 0); *rewrite = 1; } else s->src.seqdiff = 0; if (th->th_flags & TH_SYN) { s->src.seqhi++; s->src.wscale = pf_get_wscale(m, off, th->th_off, pd->af); } s->src.max_win = MAX(ntohs(th->th_win), 1); if (s->src.wscale & PF_WSCALE_MASK) { /* Remove scale factor from initial window */ int win = s->src.max_win; win += 1 << (s->src.wscale & PF_WSCALE_MASK); s->src.max_win = (win - 1) >> (s->src.wscale & PF_WSCALE_MASK); } if (th->th_flags & TH_FIN) s->src.seqhi++; s->dst.seqhi = 1; s->dst.max_win = 1; pf_set_protostate(s, PF_PEER_SRC, TCPS_SYN_SENT); pf_set_protostate(s, PF_PEER_DST, TCPS_CLOSED); s->timeout = PFTM_TCP_FIRST_PACKET; atomic_add_32(&V_pf_status.states_halfopen, 1); break; case IPPROTO_UDP: pf_set_protostate(s, PF_PEER_SRC, PFUDPS_SINGLE); pf_set_protostate(s, PF_PEER_DST, PFUDPS_NO_TRAFFIC); s->timeout = PFTM_UDP_FIRST_PACKET; break; + case IPPROTO_SCTP: + pf_set_protostate(s, PF_PEER_SRC, SCTP_COOKIE_WAIT); + pf_set_protostate(s, 
PF_PEER_DST, SCTP_CLOSED); + s->timeout = PFTM_TCP_FIRST_PACKET; + break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif s->timeout = PFTM_ICMP_FIRST_PACKET; break; default: pf_set_protostate(s, PF_PEER_SRC, PFOTHERS_SINGLE); pf_set_protostate(s, PF_PEER_DST, PFOTHERS_NO_TRAFFIC); s->timeout = PFTM_OTHER_FIRST_PACKET; } if (r->rt) { /* pf_map_addr increases the reason counters */ if ((reason = pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) != 0) { pf_src_tree_remove_state(s); s->timeout = PFTM_UNLINKED; STATE_DEC_COUNTERS(s); pf_free_state(s); goto csfailed; } s->rt_kif = r->rpool.cur->kif; s->rt = r->rt; } s->creation = time_uptime; s->expire = time_uptime; if (sn != NULL) s->src_node = sn; if (nsn != NULL) { /* XXX We only modify one side for now. */ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); s->nat_src_node = nsn; } if (pd->proto == IPPROTO_TCP) { if (s->state_flags & PFSTATE_SCRUB_TCP && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { REASON_SET(&reason, PFRES_MEMORY); pf_src_tree_remove_state(s); s->timeout = PFTM_UNLINKED; STATE_DEC_COUNTERS(s); pf_free_state(s); return (PF_DROP); } if (s->state_flags & PFSTATE_SCRUB_TCP && s->src.scrub && pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, &s->src, &s->dst, rewrite)) { /* This really shouldn't happen!!! */ DPFPRINTF(PF_DEBUG_URGENT, ("pf_normalize_tcp_stateful failed on first " "pkt\n")); pf_src_tree_remove_state(s); s->timeout = PFTM_UNLINKED; STATE_DEC_COUNTERS(s); pf_free_state(s); return (PF_DROP); } } s->direction = pd->dir; /* * sk/nk could already have been set up by pf_get_translation(). */ if (nr == NULL) { KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); if (sk == NULL) goto csfailed; nk = sk; } else KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); /* Swap sk/nk for PF_OUT. */ if (pf_state_insert(BOUND_IFACE(r, kif), kif, (pd->dir == PF_IN) ? sk : nk, (pd->dir == PF_IN) ?
nk : sk, s)) { REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); s->timeout = PFTM_UNLINKED; STATE_DEC_COUNTERS(s); pf_free_state(s); return (PF_DROP); } else *sm = s; if (tag > 0) s->tag = tag; if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC); /* undo NAT changes, if they have taken place */ if (nr != NULL) { struct pf_state_key *skt = s->key[PF_SK_WIRE]; if (pd->dir == PF_OUT) skt = s->key[PF_SK_STACK]; PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); if (pd->sport) *pd->sport = skt->port[pd->sidx]; if (pd->dport) *pd->dport = skt->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } s->src.seqhi = htonl(arc4random()); /* Find mss option */ int rtid = M_GETFIB(m); mss = pf_get_mss(m, off, th->th_off, pd->af); mss = pf_calc_mss(pd->src, pd->af, rtid, mss); mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); s->src.mss = mss; pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, s->src.mss, 0, true, 0, 0, pd->act.rtableid); REASON_SET(&reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } return (PF_PASS); csfailed: uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); if (sn != NULL) { PF_SRC_NODE_LOCK(sn); if (--sn->states == 0 && sn->expire == 0) { pf_unlink_src_node(sn); uma_zfree(V_pf_sources_z, sn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_SRC_NODE_UNLOCK(sn); } if (nsn != sn && nsn != NULL) { PF_SRC_NODE_LOCK(nsn); if (--nsn->states == 0 && nsn->expire == 0) { pf_unlink_src_node(nsn); uma_zfree(V_pf_sources_z, nsn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_SRC_NODE_UNLOCK(nsn); } return (PF_DROP); } static int pf_test_fragment(struct pf_krule **rm, struct pfi_kkif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_krule **am, struct pf_kruleset **rsm) { struct pf_krule *r, *a = NULL; struct pf_kruleset *ruleset = NULL; struct pf_krule_slist match_rules; struct pf_krule_item *ri; sa_family_t af = pd->af; u_short reason; int tag = -1; int asd = 0; int match = 0; struct pf_kanchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); SLIST_INIT(&match_rules); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->src.port_op || r->dst.port_op || r->flagset)) r = TAILQ_NEXT(r, entries); else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) r = 
TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= (arc4random() % (UINT_MAX - 1) + 1)) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else { if (r->anchor == NULL) { if (r->action == PF_MATCH) { ri = malloc(sizeof(struct pf_krule_item), M_PF_RULE_ITEM, M_NOWAIT | M_ZERO); if (ri == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } ri->r = r; SLIST_INSERT_HEAD(&match_rules, ri, entry); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); if (r->log) PFLOG_PACKET(kif, m, af, PFRES_MATCH, r, a, ruleset, pd, 1); } else { match = 1; *rm = r; *am = a; *rsm = ruleset; } if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); /* apply actions for last matching pass/block rule */ pf_rule_to_actions(r, &pd->act); if (r->log) PFLOG_PACKET(kif, m, af, reason, r, a, ruleset, pd, 1); if (r->action != PF_PASS) return (PF_DROP); if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } return (PF_PASS); cleanup: while ((ri = SLIST_FIRST(&match_rules))) { SLIST_REMOVE_HEAD(&match_rules, entry); free(ri, M_PF_RULE_ITEM); } return (PF_DROP); } static int pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, int *copyback) { struct tcphdr *th = &pd->hdr.tcp; struct pf_state_peer *src, *dst; u_int16_t win = ntohs(th->th_win); u_int32_t ack, end, seq, orig_seq; u_int8_t sws, dws, psrc, pdst; int ackskew; if (pd->dir == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; psrc = PF_PEER_SRC; pdst = PF_PEER_DST; } else { src = &(*state)->dst; dst = &(*state)->src; psrc = PF_PEER_DST; pdst = PF_PEER_SRC; } if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { sws = src->wscale & PF_WSCALE_MASK; dws = dst->wscale & PF_WSCALE_MASK; } else sws = dws = 0; /* * Sequence tracking algorithm from Guido van Rooij's paper: * http://www.madison-gurkha.com/publications/tcp_filtering/ * tcp_filtering.ps */ orig_seq = seq = ntohl(th->th_seq); if (src->seqlo == 0) { /* First packet from this end. 
Set its state */ if (((*state)->state_flags & PFSTATE_SCRUB_TCP || dst->scrub) && src->scrub == NULL) { if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } } /* Deferred generation of sequence number modulator */ if (dst->seqdiff && !src->seqdiff) { /* use random iss for the TCP server */ while ((src->seqdiff = arc4random() - seq) == 0) ; ack = ntohl(th->th_ack) - dst->seqdiff; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } else { ack = ntohl(th->th_ack); } end = seq + pd->p_len; if (th->th_flags & TH_SYN) { end++; if (dst->wscale & PF_WSCALE_FLAG) { src->wscale = pf_get_wscale(m, off, th->th_off, pd->af); if (src->wscale & PF_WSCALE_FLAG) { /* Remove scale factor from initial * window */ sws = src->wscale & PF_WSCALE_MASK; win = ((u_int32_t)win + (1 << sws) - 1) >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; /* in case of a retrans SYN|ACK */ dst->wscale = 0; } } } if (th->th_flags & TH_FIN) end++; src->seqlo = seq; if (src->state < TCPS_SYN_SENT) pf_set_protostate(*state, psrc, TCPS_SYN_SENT); /* * May need to slide the window (seqhi may have been set by * the crappy stack check or if we picked up the connection * after establishment) */ if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); if (win > src->max_win) src->max_win = win; } else { ack = ntohl(th->th_ack) - dst->seqdiff; if (src->seqdiff) { /* Modulate sequence numbers */ pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; if (th->th_flags & TH_FIN) end++; } if ((th->th_flags & TH_ACK) == 0) { /* Let it pass through the ack skew check */ ack = dst->seqlo; } else if ((ack == 0 && (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || /* broken tcp stacks do not set ack */ (dst->state < TCPS_SYN_SENT)) { /* * Many stacks (ours included) will set the ACK number in an * FIN|ACK if the SYN times out -- no sequence to ACK. */ ack = dst->seqlo; } if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; end = seq; } ackskew = dst->seqlo - ack; /* * Need to demodulate the sequence numbers in any TCP SACK options * (Selective ACK). We could optionally validate the SACK values * against the current ACK window, either forwards or backwards, but * I'm not confident that SACK has been implemented properly * everywhere. It wouldn't surprise me if several stacks accidentally * SACK too far backwards of previously ACKed data. There really aren't * any security implications of bad SACKing unless the target stack * doesn't validate the option length correctly. Someone trying to * spoof into a TCP connection won't bother blindly sending SACK * options anyway. 
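 * (pf_modulate_sack() essentially shifts each SACK block edge
 * by dst->seqdiff, mirroring the th_ack demodulation above.)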
*/ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { if (pf_modulate_sack(m, off, pd, th, dst)) *copyback = 1; } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ (ackskew <= (MAXACKWINDOW << sws)) && /* Acking not more than one window forward */ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || (pd->flags & PFDESC_IP_REAS) == 0)) { /* Require an exact/+1 sequence match on resets when possible */ if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* update states */ if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) pf_set_protostate(*state, psrc, TCPS_SYN_SENT); if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) pf_set_protostate(*state, psrc, TCPS_CLOSING); if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { pf_set_protostate(*state, pdst, TCPS_ESTABLISHED); if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) pf_set_protostate(*state, pdst, TCPS_FIN_WAIT_2); } if (th->th_flags & TH_RST) pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT); /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; /* Fall through to PASS packet */ } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { /* Within a window backward of the originating packet */ /* * This currently handles three situations: * 1) Stupid stacks will shotgun SYNs before their peer * replies. * 2) When PF catches an already established stream (the * firewall rebooted, the state table was flushed, routes * changed...) * 3) Packets get funky immediately after the connection * closes (this should catch Solaris spurious ACK|FINs * that web servers like to spew after a close) * * This must be a little more careful than the above code * since packet floods will also be caught here. We don't * update the TTL here to mitigate the damage of a packet * flood and so the same code can handle awkward establishment * and a loosened connection close. 
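 * ("TTL" here means the state's expire timestamp, not the IP
 * header TTL.)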
* In the establishment case, a correct peer response will * validate the connection, go through the normal state code * and keep updating the state TTL. */ if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: loose state match: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); } if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* * Cannot set dst->seqhi here since this could be a shotgunned * SYN and not an already established connection. */ if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) pf_set_protostate(*state, psrc, TCPS_CLOSING); if (th->th_flags & TH_RST) pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT); /* Fall through to PASS packet */ } else { if ((*state)->dst.state == TCPS_SYN_SENT && (*state)->src.state == TCPS_SYN_SENT) { /* Send RST for state mismatches during handshake */ if (!(th->th_flags & TH_RST)) pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), 0, TH_RST, 0, 0, (*state)->rule.ptr->return_ttl, true, 0, 0, (*state)->act.rtableid); src->seqlo = 0; src->seqhi = 1; src->max_win = 1; } else if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD state: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' 
' :'6'); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_sloppy(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason) { struct tcphdr *th = &pd->hdr.tcp; struct pf_state_peer *src, *dst; u_int8_t psrc, pdst; if (pd->dir == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; psrc = PF_PEER_SRC; pdst = PF_PEER_DST; } else { src = &(*state)->dst; dst = &(*state)->src; psrc = PF_PEER_DST; pdst = PF_PEER_SRC; } if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) pf_set_protostate(*state, psrc, TCPS_SYN_SENT); if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) pf_set_protostate(*state, psrc, TCPS_CLOSING); if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { pf_set_protostate(*state, pdst, TCPS_ESTABLISHED); if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) { pf_set_protostate(*state, pdst, TCPS_FIN_WAIT_2); } else if (src->state == TCPS_SYN_SENT && dst->state < TCPS_SYN_SENT) { /* * Handle a special sloppy case where we only see one * half of the connection. If there is an ACK after * the initial SYN without ever seeing a packet from * the destination, set the connection to established. */ pf_set_protostate(*state, PF_PEER_BOTH, TCPS_ESTABLISHED); dst->state = src->state = TCPS_ESTABLISHED; if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (src->state == TCPS_CLOSING && dst->state == TCPS_ESTABLISHED && dst->seqlo == 0) { /* * Handle the closing of half connections where we * don't see the full bidirectional FIN/ACK+ACK * handshake. */ pf_set_protostate(*state, pdst, TCPS_CLOSING); } } if (th->th_flags & TH_RST) pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT); /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; return (PF_PASS); } static int pf_synproxy(struct pf_pdesc *pd, struct pf_kstate **state, u_short *reason) { struct pf_state_key *sk = (*state)->key[pd->didx]; struct tcphdr *th = &pd->hdr.tcp; if ((*state)->src.state == PF_TCPS_PROXY_SRC) { if (pd->dir != (*state)->direction) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } if (th->th_flags & TH_SYN) { if (ntohl(th->th_seq) != (*state)->src.seqlo) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, (*state)->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, true, 0, 0, (*state)->act.rtableid); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } else
pf_set_protostate(*state, PF_PEER_SRC, PF_TCPS_PROXY_DST); } if ((*state)->src.state == PF_TCPS_PROXY_DST) { if (pd->dir == (*state)->direction) { if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } (*state)->src.max_win = MAX(ntohs(th->th_win), 1); if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(arc4random()); pf_send_tcp((*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, false, (*state)->tag, 0, (*state)->act.rtableid); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (((th->th_flags & (TH_SYN|TH_ACK)) != (TH_SYN|TH_ACK)) || (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else { (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); (*state)->dst.seqlo = ntohl(th->th_seq); pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ntohl(th->th_seq) + 1, TH_ACK, (*state)->src.max_win, 0, 0, false, (*state)->tag, 0, (*state)->act.rtableid); pf_send_tcp((*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, true, 0, 0, (*state)->act.rtableid); (*state)->src.seqdiff = (*state)->dst.seqhi - (*state)->src.seqlo; (*state)->dst.seqdiff = (*state)->src.seqhi - (*state)->dst.seqlo; (*state)->src.seqhi = (*state)->src.seqlo + (*state)->dst.max_win; (*state)->dst.seqhi = (*state)->dst.seqlo + (*state)->src.max_win; (*state)->src.wscale = (*state)->dst.wscale = 0; pf_set_protostate(*state, PF_PEER_BOTH, TCPS_ESTABLISHED); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } return (PF_PASS); } static int pf_test_state_tcp(struct pf_kstate **state, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_state_key_cmp key; struct tcphdr *th = &pd->hdr.tcp; int copyback = 0; int action; struct pf_state_peer *src, *dst; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_TCP; if (pd->dir == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = th->th_sport; key.port[1] = th->th_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = th->th_sport; key.port[0] = th->th_dport; } STATE_LOOKUP(kif, &key, *state, pd); if (pd->dir == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } if ((action = pf_synproxy(pd, state, reason)) != PF_PASS) return (action); if (dst->state >= TCPS_FIN_WAIT_2 && src->state >= TCPS_FIN_WAIT_2 && (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) || ((th->th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_ACK && pf_syncookie_check(pd) && pd->dir == PF_IN))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state reuse "); pf_print_state(*state); pf_print_flags(th->th_flags); printf("\n"); } /* XXX make sure it's the same direction ?? 
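 * A fresh SYN on an all-but-closed connection is treated as
 * state reuse: tear the old state down and drop this packet;
 * the retransmitted SYN then re-runs the ruleset and creates a
 * new state.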
*/ pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED); pf_unlink_state(*state); *state = NULL; return (PF_DROP); } if ((*state)->state_flags & PFSTATE_SLOPPY) { if (pf_tcp_track_sloppy(state, pd, reason) == PF_DROP) return (PF_DROP); } else { if (pf_tcp_track_full(state, kif, m, off, pd, reason, &copyback) == PF_DROP) return (PF_DROP); } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != th->th_sport) pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != th->th_dport) pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, pd->af); copyback = 1; } /* Copyback sequence modulation or stateful scrub changes if needed */ if (copyback) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); } static int pf_test_state_udp(struct pf_kstate **state, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; struct udphdr *uh = &pd->hdr.udp; uint8_t psrc, pdst; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_UDP; if (pd->dir == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = uh->uh_sport; key.port[1] = uh->uh_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = uh->uh_sport; key.port[0] = uh->uh_dport; } STATE_LOOKUP(kif, &key, *state, pd); if (pd->dir == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; psrc = PF_PEER_SRC; pdst = PF_PEER_DST; } else { src = &(*state)->dst; dst = &(*state)->src; psrc = PF_PEER_DST; pdst = PF_PEER_SRC; } /* update states */ if (src->state < PFUDPS_SINGLE) pf_set_protostate(*state, psrc, PFUDPS_SINGLE); if (dst->state == PFUDPS_SINGLE) pf_set_protostate(*state, pdst, PFUDPS_MULTIPLE); /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) (*state)->timeout = PFTM_UDP_MULTIPLE; else (*state)->timeout = PFTM_UDP_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != uh->uh_sport) pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != uh->uh_dport) pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); } return (PF_PASS); } +static int +pf_test_state_sctp(struct pf_kstate **state, struct pfi_kkif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) +{ + struct pf_state_key_cmp key; + struct pf_state_peer *src; //, *dst; + struct sctphdr *sh = &pd->hdr.sctp; + u_int8_t psrc; //, pdst; + + bzero(&key, sizeof(key)); + key.af = pd->af; + key.proto = IPPROTO_SCTP; + if (pd->dir == PF_IN) { /* wire side, straight */ + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1],
pd->dst, key.af); + key.port[0] = sh->src_port; + key.port[1] = sh->dest_port; + } else { /* stack side, reverse */ + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + key.port[1] = sh->src_port; + key.port[0] = sh->dest_port; + } + + STATE_LOOKUP(kif, &key, *state, pd); + + if (pd->dir == (*state)->direction) { + src = &(*state)->src; + psrc = PF_PEER_SRC; + } else { + src = &(*state)->dst; + psrc = PF_PEER_DST; + } + + /* Track state. */ + if (pd->sctp_flags & PFDESC_SCTP_INIT) { + if (src->state < SCTP_COOKIE_WAIT) { + pf_set_protostate(*state, psrc, SCTP_COOKIE_WAIT); + (*state)->timeout = PFTM_TCP_OPENING; + } + } + if (pd->sctp_flags & PFDESC_SCTP_COOKIE) { + if (src->state < SCTP_ESTABLISHED) { + pf_set_protostate(*state, psrc, SCTP_ESTABLISHED); + (*state)->timeout = PFTM_TCP_ESTABLISHED; + } + } + if (pd->sctp_flags & (PFDESC_SCTP_SHUTDOWN | PFDESC_SCTP_ABORT | + PFDESC_SCTP_SHUTDOWN_COMPLETE)) { + if (src->state < SCTP_SHUTDOWN_PENDING) { + pf_set_protostate(*state, psrc, SCTP_SHUTDOWN_PENDING); + (*state)->timeout = PFTM_TCP_CLOSING; + } + } + + (*state)->expire = time_uptime; + + return (PF_PASS); +} + static int pf_test_state_icmp(struct pf_kstate **state, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0, *icmpsum; u_int8_t icmptype, icmpcode; int state_icmp = 0; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp.icmp_type; icmpcode = pd->hdr.icmp.icmp_code; icmpid = pd->hdr.icmp.icmp_id; icmpsum = &pd->hdr.icmp.icmp_cksum; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6.icmp6_type; icmpcode = pd->hdr.icmp6.icmp6_code; icmpid = pd->hdr.icmp6.icmp6_id; icmpsum = &pd->hdr.icmp6.icmp6_cksum; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } if (!state_icmp) { /* * ICMP query/reply message not related to a TCP/UDP packet. * Search for an ICMP state. 
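 * (ICMP states are keyed on icmp_id in place of a port pair,
 * so a reply matches the state created by its request.)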
*/ key.af = pd->af; key.proto = pd->proto; key.port[0] = key.port[1] = icmpid; if (pd->dir == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); } STATE_LOOKUP(kif, &key, *state, pd); (*state)->expire = time_uptime; (*state)->timeout = PFTM_ICMP_ERROR_REPLY; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[0] != pd->hdr.icmp.icmp_id) { pd->hdr.icmp.icmp_cksum = pf_cksum_fixup( pd->hdr.icmp.icmp_cksum, icmpid, nk->port[pd->sidx], 0); pd->hdr.icmp.icmp_id = nk->port[pd->sidx]; } m_copyback(m, off, ICMP_MINLEN, (caddr_t )&pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->didx], 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )&pd->hdr.icmp6); break; #endif /* INET6 */ } } return (PF_PASS); } else { /* * ICMP error message in response to a TCP/UDP packet. * Extract the inner TCP/UDP header and search for that state. */ struct pf_pdesc pd2; bzero(&pd2, sizeof pd2); #ifdef INET struct ip h2; #endif /* INET */ #ifdef INET6 struct ip6_hdr h2_6; int terminal = 0; #endif /* INET6 */ int ipoff2 = 0; int off2 = 0; pd2.af = pd->af; /* Payload packet is from the opposite direction. */ pd2.sidx = (pd->dir == PF_IN) ? 1 : 0; pd2.didx = (pd->dir == PF_IN) ? 
0 : 1; switch (pd->af) { #ifdef INET case AF_INET: /* offset of h2 in mbuf chain */ ipoff2 = off + ICMP_MINLEN; if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip)\n")); return (PF_DROP); } /* * ICMP error messages don't refer to non-first * fragments */ if (h2.ip_off & htons(IP_OFFMASK)) { REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* offset of protocol header that follows h2 */ off2 = ipoff2 + (h2.ip_hl << 2); pd2.proto = h2.ip_p; pd2.src = (struct pf_addr *)&h2.ip_src; pd2.dst = (struct pf_addr *)&h2.ip_dst; pd2.ip_sum = &h2.ip_sum; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipoff2 = off + sizeof(struct icmp6_hdr); if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip6)\n")); return (PF_DROP); } pd2.proto = h2_6.ip6_nxt; pd2.src = (struct pf_addr *)&h2_6.ip6_src; pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; pd2.ip_sum = NULL; off2 = ipoff2 + sizeof(h2_6); do { switch (pd2.proto) { case IPPROTO_FRAGMENT: /* * ICMPv6 error messages for * non-first fragments */ REASON_SET(reason, PFRES_FRAG); return (PF_DROP); case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off2, &opt6, sizeof(opt6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMPv6 short opt\n")); return (PF_DROP); } if (pd2.proto == IPPROTO_AH) off2 += (opt6.ip6e_len + 2) * 4; else off2 += (opt6.ip6e_len + 1) * 8; pd2.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); break; #endif /* INET6 */ } if (PF_ANEQ(pd->dst, pd2.src, pd->af)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d outer dst: ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" inner src: "); pf_print_host(pd2.src, 0, pd2.af); printf(" -> "); pf_print_host(pd2.dst, 0, pd2.af); printf("\n"); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; u_int32_t seq; struct pf_state_peer *src, *dst; u_int8_t dws; int copyback = 0; /* * Only the first 8 bytes of the TCP header can be * expected. Don't access any TCP header fields after * th_seq, an ackskew test is not possible. 
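 * (RFC 792 only obliges the sender to quote the IP header plus
 * the first 8 octets of the offending datagram, which for TCP
 * covers the ports and th_seq but not th_ack.)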
*/ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(tcp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_TCP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = th.th_sport; key.port[pd2.didx] = th.th_dport; STATE_LOOKUP(kif, &key, *state, pd); if (pd->dir == (*state)->direction) { src = &(*state)->dst; dst = &(*state)->src; } else { src = &(*state)->src; dst = &(*state)->dst; } if (src->wscale && dst->wscale) dws = dst->wscale & PF_WSCALE_MASK; else dws = 0; /* Demodulate sequence number */ seq = ntohl(th.th_seq) - src->seqdiff; if (src->seqdiff) { pf_change_a(&th.th_seq, icmpsum, htonl(seq), 0); copyback = 1; } if (!((*state)->state_flags & PFSTATE_SLOPPY) && (!SEQ_GEQ(src->seqhi, seq) || !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: OK ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != th.th_sport) pf_change_icmp(pd2.src, &th.th_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != th.th_dport) pf_change_icmp(pd2.dst, &th.th_dport, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); copyback = 1; } if (copyback) { switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )&pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t )&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )&pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, 8, (caddr_t)&th); } return (PF_PASS); break; } case IPPROTO_UDP: { struct udphdr uh; if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(udp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_UDP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = uh.uh_sport; key.port[pd2.didx] = uh.uh_dport; STATE_LOOKUP(kif, &key, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != uh.uh_sport) pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != uh.uh_dport) pf_change_icmp(pd2.dst, &uh.uh_dport, saddr, &nk->addr[pd2.didx], 
nk->port[pd2.didx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )&pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )&pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, sizeof(uh), (caddr_t)&uh); } return (PF_PASS); break; } #ifdef INET case IPPROTO_ICMP: { struct icmp iih; if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp_id; STATE_LOOKUP(kif, &key, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp_id) pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp_id) pf_change_icmp(pd2.dst, &iih.icmp_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: { struct icmp6_hdr iih; if (!pf_pull_hdr(m, off2, &iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp6)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMPV6; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp6_id; STATE_LOOKUP(kif, &key, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp6_id) pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp6_id) pf_change_icmp(pd2.dst, &iih.icmp6_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)&pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); m_copyback(m, off2, sizeof(struct icmp6_hdr), (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET6 */ default: { key.af = pd2.af; key.proto = pd2.proto; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = 0; STATE_LOOKUP(kif, &key, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af)) pf_change_icmp(pd2.src, NULL,
daddr, &nk->addr[pd2.sidx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af)) pf_change_icmp(pd2.dst, NULL, saddr, &nk->addr[pd2.didx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )&pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } } return (PF_PASS); break; } } } } static int pf_test_state_other(struct pf_kstate **state, struct pfi_kkif *kif, struct mbuf *m, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; uint8_t psrc, pdst; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = pd->proto; if (pd->dir == PF_IN) { PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = key.port[1] = 0; } else { PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = key.port[0] = 0; } STATE_LOOKUP(kif, &key, *state, pd); if (pd->dir == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; psrc = PF_PEER_SRC; pdst = PF_PEER_DST; } else { src = &(*state)->dst; dst = &(*state)->src; psrc = PF_PEER_DST; pdst = PF_PEER_SRC; } /* update states */ if (src->state < PFOTHERS_SINGLE) pf_set_protostate(*state, psrc, PFOTHERS_SINGLE); if (dst->state == PFOTHERS_SINGLE) pf_set_protostate(*state, pdst, PFOTHERS_MULTIPLE); /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) (*state)->timeout = PFTM_OTHER_MULTIPLE; else (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; KASSERT(nk, ("%s: nk is null", __func__)); KASSERT(pd, ("%s: pd is null", __func__)); KASSERT(pd->src, ("%s: pd->src is null", __func__)); KASSERT(pd->dst, ("%s: pd->dst is null", __func__)); switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); #endif /* INET6 */ } } return (PF_PASS); } /* * ipoff and off are measured from the start of the mbuf chain. * h must be at "ipoff" on the mbuf chain.
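 * Returns p on success; on failure it sets *actionp (and
 * usually *reasonp) and returns NULL so the caller can bail
 * out with the chosen verdict.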
*/ void * pf_pull_hdr(struct mbuf *m, int off, void *p, int len, u_short *actionp, u_short *reasonp, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; if (fragoff) { if (fragoff >= len) ACTION_SET(actionp, PF_PASS); else { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_FRAG); } return (NULL); } if (m->m_pkthdr.len < off + len || ntohs(h->ip_len) < off + len) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (m->m_pkthdr.len < off + len || (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < (unsigned)(off + len)) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET6 */ } m_copydata(m, off, len, p); return (p); } int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *kif, int rtableid) { struct ifnet *ifp; /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. */ if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6)) return (1); if (af != AF_INET && af != AF_INET6) return (0); /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) return (1); ifp = (kif != NULL) ? kif->pfik_ifp : NULL; switch (af) { #ifdef INET6 case AF_INET6: return (fib6_check_urpf(rtableid, &addr->v6, 0, NHR_NONE, ifp)); #endif #ifdef INET case AF_INET: return (fib4_check_urpf(rtableid, addr->v4, 0, NHR_NONE, ifp)); #endif } return (0); } #ifdef INET static void pf_route(struct mbuf **m, struct pf_krule *r, struct ifnet *oifp, struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0, *m1, *md; struct sockaddr_in dst; struct ip *ip; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_ksrc_node *sn = NULL; int error = 0; uint16_t ip_len, ip_off; int r_rt, r_dir; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); if (s) { r_rt = s->rt; r_dir = s->direction; } else { r_rt = r->rt; r_dir = r->direction; } KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT || r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r_rt == PF_DUPTO) { if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) { if (s == NULL) { ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; /* If pfsync'd */ if (ifp == NULL) ifp = r->rpool.cur->kif ? 
r->rpool.cur->kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == oifp) { /* When the 2nd interface is not skipped */ return; } else { m0 = *m; *m = NULL; goto bad; } } else { pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED; if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) { if (s) PF_STATE_UNLOCK(s); return; } } } else { if ((r_rt == PF_REPLYTO) == (r_dir == pd->dir)) { pf_dummynet(pd, s, r, m); if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip = mtod(m0, struct ip *); bzero(&dst, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); dst.sin_addr = ip->ip_dst; bzero(&naddr, sizeof(naddr)); if (s == NULL) { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET)) dst.sin_addr.s_addr = naddr.v4.s_addr; ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET)) dst.sin_addr.s_addr = s->rt_addr.v4.s_addr; ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } /* If pfsync'd */ if (ifp == NULL) ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; if (ifp == NULL) goto bad; if (pd->dir == PF_IN) { if (pf_test(PF_OUT, 0, ifp, &m0, inp, &pd->act) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip)\n", __func__)); goto bad; } ip = mtod(m0, struct ip *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* Copied from FreeBSD 10.0-CURRENT ip_output. */ m0->m_pkthdr.csum_flags |= CSUM_IP; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { pf_sctp_checksum(m0, (uint32_t)(ip->ip_hl << 2)); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= ifp->if_mtu || (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } m_clrprotoflags(m0); /* Avoid confusing lower layers. */ md = m0; error = pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md); if (md != NULL) error = (*ifp->if_output)(ifp, md, sintosa(&dst), NULL); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. 
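* This is the path-MTU discovery case: the ICMP_UNREACH_NEEDFRAG error below carries ifp->if_mtu so the sender can shrink its path-MTU estimate, and PACKET_UNDO_NAT() first restores the pre-translation addresses so that the quoted header matches what the sender originally transmitted. */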
*/ if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; KMOD_IPSTAT_INC(ips_cantfrag); if (r_rt != PF_DUPTO) { if (s && pd->nat_rule != NULL) PACKET_UNDO_NAT(m0, pd, (ip->ip_hl << 2) + (ip_off & IP_OFFMASK), s); icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); goto done; } else goto bad; } error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist); if (error) goto bad; for (; m0; m0 = m1) { m1 = m0->m_nextpkt; m0->m_nextpkt = NULL; if (error == 0) { m_clrprotoflags(m0); md = m0; error = pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md); if (md != NULL) error = (*ifp->if_output)(ifp, md, sintosa(&dst), NULL); } else m_freem(m0); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); done: if (r_rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET */ #ifdef INET6 static void pf_route6(struct mbuf **m, struct pf_krule *r, struct ifnet *oifp, struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0, *md; struct sockaddr_in6 dst; struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_ksrc_node *sn = NULL; int r_rt, r_dir; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); if (s) { r_rt = s->rt; r_dir = s->direction; } else { r_rt = r->rt; r_dir = r->direction; } KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT || r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r_rt == PF_DUPTO) { if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) { if (s == NULL) { ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; /* If pfsync'd */ if (ifp == NULL) ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == oifp) { /* When the 2nd interface is not skipped */ return; } else { m0 = *m; *m = NULL; goto bad; } } else { pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED; if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) { if (s) PF_STATE_UNLOCK(s); return; } } } else { if ((r_rt == PF_REPLYTO) == (r_dir == pd->dir)) { pf_dummynet(pd, s, r, m); if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip6 = mtod(m0, struct ip6_hdr *); bzero(&dst, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(dst); dst.sin6_addr = ip6->ip6_dst; bzero(&naddr, sizeof(naddr)); if (s == NULL) { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &naddr, AF_INET6); ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &s->rt_addr, AF_INET6); ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; } if (s) PF_STATE_UNLOCK(s); /* If pfsync'd */ if (ifp == NULL) ifp = r->rpool.cur->kif ? 
r->rpool.cur->kif->pfik_ifp : NULL; if (ifp == NULL) goto bad; if (pd->dir == PF_IN) { if (pf_test6(PF_OUT, 0, ifp, &m0, inp, &pd->act) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", __func__)); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & ~ifp->if_hwassist) { uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6); in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr)); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } /* * If the packet is too large for the outgoing interface, * send back an icmp6 error. */ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr)) dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { md = m0; pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md); if (md != NULL) nd6_output_ifp(ifp, ifp, md, &dst, NULL); } else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r_rt != PF_DUPTO) { if (s && pd->nat_rule != NULL) PACKET_UNDO_NAT(m0, pd, ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr), s); icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); } else goto bad; } done: if (r_rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET6 */ /* * FreeBSD supports cksum offloads for the following drivers. * em(4), fxp(4), lge(4), nge(4), re(4), ti(4), txp(4), xl(4) * * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : * network driver performed the cksum including the pseudo header, we need * only verify csum_data * CSUM_DATA_VALID : * network driver performed the cksum, an additional pseudo header cksum * computation with the partial csum_data is needed (i.e. lack of H/W * support for the pseudo header, for instance sk(4) and possibly gem(4)) * * After validating the cksum of the packet, set both flags CSUM_DATA_VALID * and CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in the * upper TCP/UDP layer. * Also, set csum_data to 0xffff to force cksum validation.
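* A worked example for the second case (assuming a driver such as sk(4) that leaves a partial sum in csum_data): pf_check_proto_cksum() below finishes it with sum = in_pseudo(ip_src, ip_dst, htonl(len + csum_data + IPPROTO_TCP)) and applies sum ^= 0xffff; a packet whose checksum verifies yields sum == 0, and any non-zero result is counted and dropped as a bad checksum.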
*/ static int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t sum = 0; int hw_assist = 0; struct ip *ip; if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (p) { case IPPROTO_TCP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_UDP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ break; default: return (1); } if (!hw_assist) { switch (af) { case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } } if (sum) { switch (p) { case IPPROTO_TCP: { KMOD_TCPSTAT_INC(tcps_rcvbadsum); break; } case IPPROTO_UDP: { KMOD_UDPSTAT_INC(udps_badsum); break; } #ifdef INET case IPPROTO_ICMP: { KMOD_ICMPSTAT_INC(icps_checksum); break; } #endif #ifdef INET6 case IPPROTO_ICMPV6: { KMOD_ICMP6STAT_INC(icp6s_checksum); break; } #endif /* INET6 */ } return (1); } else { if (p == IPPROTO_TCP || p == IPPROTO_UDP) { m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } return (0); } static bool pf_pdesc_to_dnflow(const struct pf_pdesc *pd, const struct pf_krule *r, const struct pf_kstate *s, struct ip_fw_args *dnflow) { int dndir = r->direction; if (s && dndir == PF_INOUT) { dndir = s->direction; } else if (dndir == PF_INOUT) { /* Assume primary direction. Happens when we've set dnpipe in * the ethernet level code. 
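* Spelled out: an explicit PF_IN/PF_OUT on the rule is used as-is; PF_INOUT takes the state's recorded direction when a state exists, and otherwise falls back to the direction of the packet currently being processed.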
*/ dndir = pd->dir; } memset(dnflow, 0, sizeof(*dnflow)); if (pd->dport != NULL) dnflow->f_id.dst_port = ntohs(*pd->dport); if (pd->sport != NULL) dnflow->f_id.src_port = ntohs(*pd->sport); if (pd->dir == PF_IN) dnflow->flags |= IPFW_ARGS_IN; else dnflow->flags |= IPFW_ARGS_OUT; if (pd->dir != dndir && pd->act.dnrpipe) { dnflow->rule.info = pd->act.dnrpipe; } else if (pd->dir == dndir && pd->act.dnpipe) { dnflow->rule.info = pd->act.dnpipe; } else { return (false); } dnflow->rule.info |= IPFW_IS_DUMMYNET; if (r->free_flags & PFRULE_DN_IS_PIPE || pd->act.flags & PFSTATE_DN_IS_PIPE) dnflow->rule.info |= IPFW_IS_PIPE; dnflow->f_id.proto = pd->proto; dnflow->f_id.extra = dnflow->rule.info; switch (pd->af) { case AF_INET: dnflow->f_id.addr_type = 4; dnflow->f_id.src_ip = ntohl(pd->src->v4.s_addr); dnflow->f_id.dst_ip = ntohl(pd->dst->v4.s_addr); break; case AF_INET6: dnflow->flags |= IPFW_ARGS_IP6; dnflow->f_id.addr_type = 6; dnflow->f_id.src_ip6 = pd->src->v6; dnflow->f_id.dst_ip6 = pd->dst->v6; break; default: panic("Invalid AF"); break; } return (true); } int pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kkif *kif; struct mbuf *m = *m0; M_ASSERTPKTHDR(m); MPASS(ifp->if_vnet == curvnet); NET_EPOCH_ASSERT(); if (!V_pf_status.running) return (PF_PASS); kif = (struct pfi_kkif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); /* Stateless! */ return (pf_test_eth_rule(dir, kif, m0)); } static int pf_dummynet(struct pf_pdesc *pd, struct pf_kstate *s, struct pf_krule *r, struct mbuf **m0) { return (pf_dummynet_route(pd, s, r, NULL, NULL, m0)); } static int pf_dummynet_route(struct pf_pdesc *pd, struct pf_kstate *s, struct pf_krule *r, struct ifnet *ifp, struct sockaddr *sa, struct mbuf **m0) { NET_EPOCH_ASSERT(); if (pd->act.dnpipe || pd->act.dnrpipe) { struct ip_fw_args dnflow; if (ip_dn_io_ptr == NULL) { m_freem(*m0); *m0 = NULL; return (ENOMEM); } if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m0)) == NULL)) { m_freem(*m0); *m0 = NULL; return (ENOMEM); } if (ifp != NULL) { pd->pf_mtag->flags |= PF_MTAG_FLAG_ROUTE_TO; pd->pf_mtag->if_index = ifp->if_index; pd->pf_mtag->if_idxgen = ifp->if_idxgen; MPASS(sa != NULL); if (pd->af == AF_INET) memcpy(&pd->pf_mtag->dst, sa, sizeof(struct sockaddr_in)); else memcpy(&pd->pf_mtag->dst, sa, sizeof(struct sockaddr_in6)); } if (pf_pdesc_to_dnflow(pd, r, s, &dnflow)) { pd->pf_mtag->flags |= PF_MTAG_FLAG_DUMMYNET; ip_dn_io_ptr(m0, &dnflow); if (*m0 != NULL) { pd->pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO; pd->pf_mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET; } } } return (0); } #ifdef INET int pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp, struct pf_rule_actions *default_actions) { struct pfi_kkif *kif; u_short action, reason = 0; struct mbuf *m = *m0; struct ip *h = NULL; struct m_tag *ipfwtag; struct pf_krule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_kstate *s = NULL; struct pf_kruleset *ruleset = NULL; struct pf_pdesc pd; int off, dirndx, use_2nd_queue = 0; uint16_t tag; uint8_t rt; PF_RULES_RLOCK_TRACKER; KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir)); M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); PF_RULES_RLOCK(); kif = (struct pfi_kkif *)ifp->if_pf_kif; if (__predict_false(kif == NULL)) { 
DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); PF_RULES_RUNLOCK(); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { PF_RULES_RUNLOCK(); return (PF_PASS); } if (m->m_flags & M_SKIP_FIREWALL) { PF_RULES_RUNLOCK(); return (PF_PASS); } memset(&pd, 0, sizeof(pd)); if (default_actions != NULL) memcpy(&pd.act, default_actions, sizeof(pd.act)); pd.pf_mtag = pf_find_mtag(m); if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_MTAG_FLAG_ROUTE_TO)) { pd.pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO; ifp = ifnet_byindexgen(pd.pf_mtag->if_index, pd.pf_mtag->if_idxgen); if (ifp == NULL || ifp->if_flags & IFF_DYING) { PF_RULES_RUNLOCK(); m_freem(*m0); *m0 = NULL; return (PF_PASS); } PF_RULES_RUNLOCK(); (ifp->if_output)(ifp, m, sintosa(&pd.pf_mtag->dst), NULL); *m0 = NULL; return (PF_PASS); } if (pd.pf_mtag && pd.pf_mtag->dnpipe) { pd.act.dnpipe = pd.pf_mtag->dnpipe; pd.act.flags = pd.pf_mtag->dnflags; } if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL && pd.pf_mtag->flags & PF_MTAG_FLAG_DUMMYNET) { /* Dummynet re-injects packets after they've * completed their delay. We've already * processed them, so pass unconditionally. */ /* But only once. We may see the packet multiple times (e.g. * PFIL_IN/PFIL_OUT). */ pd.pf_mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET; PF_RULES_RUNLOCK(); return (PF_PASS); } pd.sport = pd.dport = NULL; pd.proto_sum = NULL; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET; pd.act.rtableid = -1; if (__predict_false(ip_divert_ptr != NULL) && ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; goto done; } pd.pf_mtag->flags |= PF_MTAG_FLAG_PACKET_LOOPED; m_tag_delete(m, ipfwtag); } if (pd.pf_mtag && pd.pf_mtag->flags & PF_MTAG_FLAG_FASTFWD_OURS_PRESENT) { m->m_flags |= M_FASTFWD_OURS; pd.pf_mtag->flags &= ~PF_MTAG_FLAG_FASTFWD_OURS_PRESENT; } } else if (pf_normalize_ip(m0, kif, &reason, &pd) != PF_PASS) { /* We do IP header normalization and packet reassembly here */ action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip *); off = h->ip_hl << 2; if (off < (int)sizeof(struct ip)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); pd.act.log = PF_LOG_FORCE; goto done; } pd.src = (struct pf_addr *)&h->ip_src; pd.dst = (struct pf_addr *)&h->ip_dst; pd.ip_sum = &h->ip_sum; pd.proto = h->ip_p; pd.tos = h->ip_tos & ~IPTOS_ECN_MASK; pd.tot_len = ntohs(h->ip_len); /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { action = pf_test_fragment(&r, kif, m, h, &pd, &a, &ruleset); goto done; } switch (h->ip_p) { case IPPROTO_TCP: { if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp), &action, &reason, AF_INET)) { if (action != PF_PASS) pd.act.log = PF_LOG_FORCE; goto done; } pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2); pd.sport = &pd.hdr.tcp.th_sport; pd.dport = &pd.hdr.tcp.th_dport; /* Respond to SYN with a syncookie. 
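* No state is created here: pf_syncookie_send() answers the SYN directly with a cookie-encoded SYN|ACK and the SYN itself is dropped; only when the peer's ACK later passes pf_syncookie_validate() (below) is the original SYN re-created via pf_syncookie_recreate_syn() and run through the ruleset.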
*/ if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_SYN && pd.dir == PF_IN && pf_synflood_check(&pd)) { pf_syncookie_send(m, off, &pd); action = PF_DROP; break; } if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0) use_2nd_queue = 1; action = pf_normalize_tcp(kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) { /* Validate remote SYN|ACK, re-create original SYN if * valid. */ if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_ACK && pf_syncookie_validate(&pd) && pd.dir == PF_IN) { struct mbuf *msyn; msyn = pf_syncookie_recreate_syn(h->ip_ttl, off,&pd); if (msyn == NULL) { action = PF_DROP; break; } action = pf_test(dir, pflags, ifp, &msyn, inp, &pd.act); m_freem(msyn); if (action == PF_PASS) { action = pf_test_state_tcp(&s, kif, m, off, h, &pd, &reason); if (action != PF_PASS || s == NULL) { action = PF_DROP; break; } s->src.seqhi = ntohl(pd.hdr.tcp.th_ack) - 1; s->src.seqlo = ntohl(pd.hdr.tcp.th_seq) - 1; pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_DST); action = pf_synproxy(&pd, &s, &reason); if (action != PF_PASS) break; } break; } else { action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); } } break; } case IPPROTO_UDP: { if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp), &action, &reason, AF_INET)) { if (action != PF_PASS) pd.act.log = PF_LOG_FORCE; goto done; } pd.sport = &pd.hdr.udp.uh_sport; pd.dport = &pd.hdr.udp.uh_dport; if (pd.hdr.udp.uh_dport == 0 || ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off || ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } + case IPPROTO_SCTP: { + if (!pf_pull_hdr(m, off, &pd.hdr.sctp, sizeof(pd.hdr.sctp), + &action, &reason, AF_INET)) { + if (action != PF_PASS) + pd.act.log |= PF_LOG_FORCE; + goto done; + } + pd.sport = &pd.hdr.sctp.src_port; + pd.dport = &pd.hdr.sctp.dest_port; + if (pd.hdr.sctp.src_port == 0 || pd.hdr.sctp.dest_port == 0) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_normalize_sctp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_sctp(&s, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (V_pfsync_update_state_ptr != NULL) + V_pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + } else { + action = pf_test_rule(&r, &s, kif, m, off, + &pd, &a, &ruleset, inp); + } + break; + } + case IPPROTO_ICMP: { if (!pf_pull_hdr(m, off, &pd.hdr.icmp, ICMP_MINLEN, &action, &reason, AF_INET)) { if (action != PF_PASS) pd.act.log = PF_LOG_FORCE; goto done; } action = pf_test_state_icmp(&s, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } #ifdef INET6 case IPPROTO_ICMPV6: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv4 packet with ICMPv6 payload\n")); goto done; } #endif default: action = 
pf_test_state_other(&s, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (action == PF_PASS && h->ip_hl > 5 && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with ip options\n")); } if (s) { memcpy(&pd.act, &s->act, sizeof(struct pf_rule_actions)); tag = s->tag; rt = s->rt; } else { tag = r->tag; rt = r->rt; } if (tag > 0 && pf_tag_packet(m, &pd, tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } pf_scrub_ip(&m, &pd); if (pd.proto == IPPROTO_TCP && pd.act.max_mss) pf_normalize_mss(m, off, &pd); if (pd.act.rtableid >= 0) M_SETFIB(m, pd.act.rtableid); if (pd.act.flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) use_2nd_queue = 1; if (vlan_set_pcp(m, pd.act.set_prio[use_2nd_queue])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && pd.act.qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (use_2nd_queue || (pd.tos & IPTOS_LOWDELAY)) pd.pf_mtag->qid = pd.act.pqid; else pd.pf_mtag->qid = pd.act.qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ /* * connections redirected to loopback should not match sockets * bound specifically to loopback due to security implications, * see tcp_input() and in_pcblookup_listen(). */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN_LOOPBACK(ntohl(pd.dst->v4.s_addr))) m->m_flags |= M_SKIP_FIREWALL; if (__predict_false(ip_divert_ptr != NULL) && action == PF_PASS && r->divert.port && !PACKET_LOOPED(&pd)) { ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (ipfwtag != NULL) { ((struct ipfw_rule_ref *)(ipfwtag+1))->info = ntohs(r->divert.port); ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; if (s) PF_STATE_UNLOCK(s); m_tag_prepend(m, ipfwtag); if (m->m_flags & M_FASTFWD_OURS) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate tag\n")); } else { pd.pf_mtag->flags |= PF_MTAG_FLAG_FASTFWD_OURS_PRESENT; m->m_flags &= ~M_FASTFWD_OURS; } } ip_divert_ptr(*m0, dir == PF_IN); *m0 = NULL; return (action); } else { /* XXX: ipfw has the same behaviour! 
*/ action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate divert tag\n")); } } if (pd.act.log) { struct pf_krule *lr; struct pf_krule_item *ri; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; if (pd.act.log & PF_LOG_FORCE || lr->log & PF_LOG_ALL) PFLOG_PACKET(kif, m, AF_INET, reason, lr, a, ruleset, &pd, (s == NULL)); if (s) { SLIST_FOREACH(ri, &s->match_rules, entry) if (ri->r->log & PF_LOG_ALL) PFLOG_PACKET(kif, m, AF_INET, reason, ri->r, a, ruleset, &pd, 0); } } pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS], pd.tot_len); pf_counter_u64_add_protected(&kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS], 1); if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); pf_counter_u64_add_protected(&r->packets[dirndx], 1); pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len); pf_update_timestamp(r); if (a != NULL) { pf_counter_u64_add_protected(&a->packets[dirndx], 1); pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len); } if (s != NULL) { struct pf_krule_item *ri; if (s->nat_rule.ptr != NULL) { pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx], 1); pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx], pd.tot_len); } if (s->src_node != NULL) { counter_u64_add(s->src_node->packets[dirndx], 1); counter_u64_add(s->src_node->bytes[dirndx], pd.tot_len); } if (s->nat_src_node != NULL) { counter_u64_add(s->nat_src_node->packets[dirndx], 1); counter_u64_add(s->nat_src_node->bytes[dirndx], pd.tot_len); } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; SLIST_FOREACH(ri, &s->match_rules, entry) { pf_counter_u64_add_protected(&ri->r->packets[dirndx], 1); pf_counter_u64_add_protected(&ri->r->bytes[dirndx], pd.tot_len); } } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_OUT)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_IN)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } pf_counter_u64_critical_exit(); switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route() returns unlocked. 
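* That is, pf_route() takes ownership of both the mbuf and the state lock, so the caller must return immediately afterwards without touching s or *m0 again.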
*/ if (rt) { pf_route(m0, r, kif->pfik_ifp, s, &pd, inp); return (action); } if (pf_dummynet(&pd, s, r, m0) != 0) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } break; } SDT_PROBE4(pf, ip, test, done, action, reason, r, s); if (s) PF_STATE_UNLOCK(s); return (action); } #endif /* INET */ #ifdef INET6 int pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp, struct pf_rule_actions *default_actions) { struct pfi_kkif *kif; u_short action, reason = 0; struct mbuf *m = *m0, *n = NULL; struct m_tag *mtag; struct ip6_hdr *h = NULL; struct pf_krule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_kstate *s = NULL; struct pf_kruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0, use_2nd_queue = 0; uint16_t tag; uint8_t rt; PF_RULES_RLOCK_TRACKER; KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir)); M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); PF_RULES_RLOCK(); kif = (struct pfi_kkif *)ifp->if_pf_kif; if (__predict_false(kif == NULL)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); PF_RULES_RUNLOCK(); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { PF_RULES_RUNLOCK(); return (PF_PASS); } if (m->m_flags & M_SKIP_FIREWALL) { PF_RULES_RUNLOCK(); return (PF_PASS); } memset(&pd, 0, sizeof(pd)); if (default_actions != NULL) memcpy(&pd.act, default_actions, sizeof(pd.act)); pd.pf_mtag = pf_find_mtag(m); if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_MTAG_FLAG_ROUTE_TO)) { pd.pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO; ifp = ifnet_byindexgen(pd.pf_mtag->if_index, pd.pf_mtag->if_idxgen); if (ifp == NULL || ifp->if_flags & IFF_DYING) { PF_RULES_RUNLOCK(); m_freem(*m0); *m0 = NULL; return (PF_PASS); } PF_RULES_RUNLOCK(); nd6_output_ifp(ifp, ifp, m, (struct sockaddr_in6 *)&pd.pf_mtag->dst, NULL); *m0 = NULL; return (PF_PASS); } if (pd.pf_mtag && pd.pf_mtag->dnpipe) { pd.act.dnpipe = pd.pf_mtag->dnpipe; pd.act.flags = pd.pf_mtag->dnflags; } if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL && pd.pf_mtag->flags & PF_MTAG_FLAG_DUMMYNET) { pd.pf_mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET; /* Dummynet re-injects packets after they've * completed their delay. We've already * processed them, so pass unconditionally. */ PF_RULES_RUNLOCK(); return (PF_PASS); } pd.sport = pd.dport = NULL; pd.ip_sum = NULL; pd.proto_sum = NULL; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET6; pd.act.rtableid = -1; /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(m0, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip6_hdr *); /* * we do not support jumbogram. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. 
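* (A jumbogram, per RFC 2675, sets ip6_plen to 0 and carries the real length in a hop-by-hop Jumbo Payload option; pd.tot_len below is derived from ip6_plen, so accepting such a packet would throw off all subsequent length accounting.) */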
*/ if (htons(h->ip6_plen) == 0) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; pd.tos = IPV6_DSCP(h); pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); pd.proto = h->ip6_nxt; do { switch (pd.proto) { case IPPROTO_FRAGMENT: action = pf_test_fragment(&r, kif, m, h, &pd, &a, &ruleset); if (action == PF_DROP) REASON_SET(&reason, PFRES_FRAG); goto done; case IPPROTO_ROUTING: { struct ip6_rthdr rthdr; if (rh_cnt++) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 more than one rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); pd.act.log = PF_LOG_FORCE; goto done; } if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); pd.act.log = PF_LOG_FORCE; goto done; } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 rthdr0\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); pd.act.log = PF_LOG_FORCE; goto done; } /* FALLTHROUGH */ } case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); action = PF_DROP; pd.act.log = PF_LOG_FORCE; goto done; } if (pd.proto == IPPROTO_AH) off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); /* if there's no routing header, use unmodified mbuf for checksumming */ if (!n) n = m; switch (pd.proto) { case IPPROTO_TCP: { if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp), &action, &reason, AF_INET6)) { if (action != PF_PASS) pd.act.log |= PF_LOG_FORCE; goto done; } pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2); pd.sport = &pd.hdr.tcp.th_sport; pd.dport = &pd.hdr.tcp.th_dport; action = pf_normalize_tcp(kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp), &action, &reason, AF_INET6)) { if (action != PF_PASS) pd.act.log |= PF_LOG_FORCE; goto done; } pd.sport = &pd.hdr.udp.uh_sport; pd.dport = &pd.hdr.udp.uh_dport; if (pd.hdr.udp.uh_dport == 0 || ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off || ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } + case IPPROTO_SCTP: { + if (!pf_pull_hdr(m, off, &pd.hdr.sctp, sizeof(pd.hdr.sctp), + &action, &reason, AF_INET6)) { + if (action != PF_PASS) + pd.act.log |= PF_LOG_FORCE; + goto done; + } + pd.sport = &pd.hdr.sctp.src_port; + pd.dport = &pd.hdr.sctp.dest_port; + if (pd.hdr.sctp.src_port == 0 || pd.hdr.sctp.dest_port == 0) { + action = PF_DROP; + 
REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_normalize_sctp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_sctp(&s, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (V_pfsync_update_state_ptr != NULL) + V_pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + } else { + action = pf_test_rule(&r, &s, kif, m, off, + &pd, &a, &ruleset, inp); + } + break; + } + case IPPROTO_ICMP: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv6 packet with ICMPv4 payload\n")); goto done; } case IPPROTO_ICMPV6: { if (!pf_pull_hdr(m, off, &pd.hdr.icmp6, sizeof(pd.hdr.icmp6), &action, &reason, AF_INET6)) { if (action != PF_PASS) pd.act.log |= PF_LOG_FORCE; goto done; } action = pf_test_state_icmp(&s, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } default: action = pf_test_state_other(&s, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; } else if (s == NULL) action = pf_test_rule(&r, &s, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (n != m) { m_freem(n); n = NULL; } /* handle dangerous IPv6 extension headers. */ if (action == PF_PASS && rh_cnt && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); pd.act.log = r->log; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with dangerous v6 headers\n")); } if (s) { memcpy(&pd.act, &s->act, sizeof(struct pf_rule_actions)); tag = s->tag; rt = s->rt; } else { tag = r->tag; rt = r->rt; } if (tag > 0 && pf_tag_packet(m, &pd, tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } pf_scrub_ip6(&m, &pd); if (pd.proto == IPPROTO_TCP && pd.act.max_mss) pf_normalize_mss(m, off, &pd); if (pd.act.rtableid >= 0) M_SETFIB(m, pd.act.rtableid); if (pd.act.flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) use_2nd_queue = 1; if (vlan_set_pcp(m, pd.act.set_prio[use_2nd_queue])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && pd.act.qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (pd.tos & IPTOS_LOWDELAY) pd.pf_mtag->qid = pd.act.pqid; else pd.pf_mtag->qid = pd.act.qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) m->m_flags |= M_SKIP_FIREWALL; /* XXX: Anybody working on it?! 
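* Packet diversion is only wired up for the IPv4 path (see the ip_divert_ptr handling in pf_test() above); for IPv6 a matching divert rule has no effect beyond the warning printed below.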
*/ if (r->divert.port) printf("pf: divert(9) is not supported for IPv6\n"); if (pd.act.log) { struct pf_krule *lr; struct pf_krule_item *ri; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; if (pd.act.log & PF_LOG_FORCE || lr->log & PF_LOG_ALL) PFLOG_PACKET(kif, m, AF_INET6, reason, lr, a, ruleset, &pd, (s == NULL)); if (s) { SLIST_FOREACH(ri, &s->match_rules, entry) if (ri->r->log & PF_LOG_ALL) PFLOG_PACKET(kif, m, AF_INET6, reason, ri->r, a, ruleset, &pd, 0); } } pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS], pd.tot_len); pf_counter_u64_add_protected(&kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS], 1); if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); pf_counter_u64_add_protected(&r->packets[dirndx], 1); pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len); if (a != NULL) { pf_counter_u64_add_protected(&a->packets[dirndx], 1); pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len); } if (s != NULL) { if (s->nat_rule.ptr != NULL) { pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx], 1); pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx], pd.tot_len); } if (s->src_node != NULL) { counter_u64_add(s->src_node->packets[dirndx], 1); counter_u64_add(s->src_node->bytes[dirndx], pd.tot_len); } if (s->nat_src_node != NULL) { counter_u64_add(s->nat_src_node->packets[dirndx], 1); counter_u64_add(s->nat_src_node->bytes[dirndx], pd.tot_len); } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]->addr[0], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]->addr[1], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } pf_counter_u64_critical_exit(); switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route6() returns unlocked. */ if (rt) { pf_route6(m0, r, kif->pfik_ifp, s, &pd, inp); return (action); } if (pf_dummynet(&pd, s, r, m0) != 0) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } break; } if (s) PF_STATE_UNLOCK(s); /* If reassembled packet passed, create new fragments. */ if (action == PF_PASS && *m0 && dir == PF_OUT && (mtag = m_tag_find(m, PACKET_TAG_PF_REASSEMBLED, NULL)) != NULL) action = pf_refragment6(ifp, m0, mtag, pflags & PFIL_FWD); SDT_PROBE4(pf, ip, test6, done, action, reason, r, s); return (action); } #endif /* INET6 */ diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c index 5f6d9c1635cd..38312712a0ad 100644 --- a/sys/netpfil/pf/pf_norm.c +++ b/sys/netpfil/pf/pf_norm.c @@ -1,2080 +1,2260 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2001 Niels Provos * Copyright 2011-2018 Alexander Bluhm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #ifdef INET6 #include #endif /* INET6 */ struct pf_frent { TAILQ_ENTRY(pf_frent) fr_next; struct mbuf *fe_m; uint16_t fe_hdrlen; /* ipv4 header length with ip options ipv6, extension, fragment header */ uint16_t fe_extoff; /* last extension header offset or 0 */ uint16_t fe_len; /* fragment length */ uint16_t fe_off; /* fragment offset */ uint16_t fe_mff; /* more fragment flag */ }; struct pf_fragment_cmp { struct pf_addr frc_src; struct pf_addr frc_dst; uint32_t frc_id; sa_family_t frc_af; uint8_t frc_proto; }; struct pf_fragment { struct pf_fragment_cmp fr_key; #define fr_src fr_key.frc_src #define fr_dst fr_key.frc_dst #define fr_id fr_key.frc_id #define fr_af fr_key.frc_af #define fr_proto fr_key.frc_proto /* pointers to queue element */ struct pf_frent *fr_firstoff[PF_FRAG_ENTRY_POINTS]; /* count entries between pointers */ uint8_t fr_entries[PF_FRAG_ENTRY_POINTS]; RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; uint32_t fr_timeout; uint16_t fr_maxlen; /* maximum length of single fragment */ u_int16_t fr_holes; /* number of holes in the queue */ TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; }; struct pf_fragment_tag { uint16_t ft_hdrlen; /* header length of reassembled pkt */ uint16_t ft_extoff; /* last extension header offset or 0 */ uint16_t ft_maxlen; /* maximum fragment payload length */ uint32_t ft_id; /* fragment id */ }; VNET_DEFINE_STATIC(struct mtx, pf_frag_mtx); #define V_pf_frag_mtx VNET(pf_frag_mtx) #define PF_FRAG_LOCK() mtx_lock(&V_pf_frag_mtx) #define PF_FRAG_UNLOCK() mtx_unlock(&V_pf_frag_mtx) #define PF_FRAG_ASSERT() mtx_assert(&V_pf_frag_mtx, MA_OWNED) VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); VNET_DEFINE_STATIC(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) RB_HEAD(pf_frag_tree, pf_fragment); VNET_DEFINE_STATIC(struct pf_frag_tree, pf_frag_tree); #define 
V_pf_frag_tree VNET(pf_frag_tree) static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static void pf_flush_fragments(void); static void pf_free_fragment(struct pf_fragment *); static void pf_remove_fragment(struct pf_fragment *); static struct pf_frent *pf_create_fragment(u_short *); static int pf_frent_holes(struct pf_frent *frent); static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree); static inline int pf_frent_index(struct pf_frent *); static int pf_frent_insert(struct pf_fragment *, struct pf_frent *, struct pf_frent *); void pf_frent_remove(struct pf_fragment *, struct pf_frent *); struct pf_frent *pf_frent_previous(struct pf_fragment *, struct pf_frent *); static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, struct pf_frent *, u_short *); static struct mbuf *pf_join_fragment(struct pf_fragment *); #ifdef INET static int pf_reassemble(struct mbuf **, struct ip *, int, u_short *); #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **, struct ip6_hdr *, struct ip6_frag *, uint16_t, uint16_t, u_short *); #endif /* INET6 */ #define DPFPRINTF(x) do { \ if (V_pf_status.debug >= PF_DEBUG_MISC) { \ printf("%s: ", __func__); \ printf x ; \ } \ } while(0) #ifdef INET static void pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key) { key->frc_src.v4 = ip->ip_src; key->frc_dst.v4 = ip->ip_dst; key->frc_af = AF_INET; key->frc_proto = ip->ip_p; key->frc_id = ip->ip_id; } #endif /* INET */ void pf_normalize_init(void) { V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&V_pf_frag_mtx, "pf fragments", NULL, MTX_DEF); V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); TAILQ_INIT(&V_pf_fragqueue); } void pf_normalize_cleanup(void) { uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); uma_zdestroy(V_pf_frag_z); mtx_destroy(&V_pf_frag_mtx); } static int pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; if ((diff = a->fr_id - b->fr_id) != 0) return (diff); if ((diff = a->fr_proto - b->fr_proto) != 0) return (diff); if ((diff = a->fr_af - b->fr_af) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) return (diff); return (0); } void pf_purge_expired_fragments(void) { u_int32_t expire = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG]; pf_purge_fragments(expire); } void pf_purge_fragments(uint32_t expire) { struct pf_fragment *frag; PF_FRAG_LOCK(); while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); } PF_FRAG_UNLOCK(); } /* * Try to flush old fragments to make space for new ones */ static void pf_flush_fragments(void) { struct pf_fragment *frag; int goal; PF_FRAG_ASSERT(); goal = 
uma_zone_get_cur(V_pf_frent_z) * 9 / 10; DPFPRINTF(("trying to free %d frag entries\n", goal)); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) pf_free_fragment(frag); else break; } } /* Frees the fragments and all associated entries */ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; PF_FRAG_ASSERT(); /* Free all fragments */ for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = TAILQ_FIRST(&frag->fr_queue)) { TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); m_freem(frent->fe_m); uma_zfree(V_pf_frent_z, frent); } pf_remove_fragment(frag); } static struct pf_fragment * pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) { struct pf_fragment *frag; PF_FRAG_ASSERT(); frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? */ frag->fr_timeout = time_uptime; TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); } return (frag); } /* Removes a fragment from the fragment queue and frees the fragment */ static void pf_remove_fragment(struct pf_fragment *frag) { PF_FRAG_ASSERT(); KASSERT(frag, ("frag != NULL")); RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } static struct pf_frent * pf_create_fragment(u_short *reason) { struct pf_frent *frent; PF_FRAG_ASSERT(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { pf_flush_fragments(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { REASON_SET(reason, PFRES_MEMORY); return (NULL); } } return (frent); } /* * Calculate the additional holes that were created in the fragment * queue by inserting this fragment. A fragment in the middle * creates one more hole by splitting. For each connected side, * it loses one hole. * Fragment entry must be in the queue when calling this function. */ static int pf_frent_holes(struct pf_frent *frent) { struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int holes = 1; if (prev == NULL) { if (frent->fe_off == 0) holes--; } else { KASSERT(frent->fe_off != 0, ("frent->fe_off != 0")); if (frent->fe_off == prev->fe_off + prev->fe_len) holes--; } if (next == NULL) { if (!frent->fe_mff) holes--; } else { KASSERT(frent->fe_mff, ("frent->fe_mff")); if (next->fe_off == frent->fe_off + frent->fe_len) holes--; } return holes; } static inline int pf_frent_index(struct pf_frent *frent) { /* * We have an array of 16 entry points to the queue. A full size * 65535 octet IP packet can have 8192 fragments. So the queue * traversal length is at most 512 and at most 16 entry points are * checked. We need 128 additional bytes on a 64 bit architecture. */ CTASSERT(((u_int16_t)0xffff &~ 7) / (0x10000 / PF_FRAG_ENTRY_POINTS) == 16 - 1); CTASSERT(((u_int16_t)0xffff >> 3) / PF_FRAG_ENTRY_POINTS == 512 - 1); return frent->fe_off / (0x10000 / PF_FRAG_ENTRY_POINTS); } static int pf_frent_insert(struct pf_fragment *frag, struct pf_frent *frent, struct pf_frent *prev) { int index; CTASSERT(PF_FRAG_ENTRY_LIMIT <= 0xff); /* * A packet has at most 65536 octets. With 16 entry points, each one * spans 4096 octets. We limit these to 64 fragments each, which * means on average every fragment must have at least 64 octets.
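* Spelled out: 65536 / 16 entry points = 4096 octets per entry point, and 4096 octets / 64 fragments = a 64 octet average minimum; pf_frent_index() above maps an offset to its entry point as fe_off / 4096.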
*/ index = pf_frent_index(frent); if (frag->fr_entries[index] >= PF_FRAG_ENTRY_LIMIT) return ENOBUFS; frag->fr_entries[index]++; if (prev == NULL) { TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); } else { KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next); } if (frag->fr_firstoff[index] == NULL) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(prev) < index")); frag->fr_firstoff[index] = frent; } else { if (frent->fe_off < frag->fr_firstoff[index]->fe_off) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(prev) < index")); frag->fr_firstoff[index] = frent; } else { KASSERT(prev != NULL, ("prev != NULL")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } } frag->fr_holes += pf_frent_holes(frent); return 0; } void pf_frent_remove(struct pf_fragment *frag, struct pf_frent *frent) { #ifdef INVARIANTS struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); #endif struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int index; frag->fr_holes -= pf_frent_holes(frent); index = pf_frent_index(frent); KASSERT(frag->fr_firstoff[index] != NULL, ("frent not found")); if (frag->fr_firstoff[index]->fe_off == frent->fe_off) { if (next == NULL) { frag->fr_firstoff[index] = NULL; } else { KASSERT(frent->fe_off + frent->fe_len <= next->fe_off, ("overlapping fragment")); if (pf_frent_index(next) == index) { frag->fr_firstoff[index] = next; } else { frag->fr_firstoff[index] = NULL; } } } else { KASSERT(frag->fr_firstoff[index]->fe_off < frent->fe_off, ("frag->fr_firstoff[index]->fe_off < frent->fe_off")); KASSERT(prev != NULL, ("prev != NULL")); KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); KASSERT(frag->fr_entries[index] > 0, ("No fragments remaining")); frag->fr_entries[index]--; } struct pf_frent * pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent) { struct pf_frent *prev, *next; int index; /* * If there are no fragments after frag, take the final one. Assume * that the global queue is not empty. */ prev = TAILQ_LAST(&frag->fr_queue, pf_fragq); KASSERT(prev != NULL, ("prev != NULL")); if (prev->fe_off <= frent->fe_off) return prev; /* * We want to find a fragment entry that is before frag, but still * close to it. Find the first fragment entry that is in the same * entry point or in the first entry point after that. As we have * already checked that there are entries behind frag, this will * succeed. */ for (index = pf_frent_index(frent); index < PF_FRAG_ENTRY_POINTS; index++) { prev = frag->fr_firstoff[index]; if (prev != NULL) break; } KASSERT(prev != NULL, ("prev != NULL")); /* * In prev we may have a fragment from the same entry point that is * before frent, or one that is just one position behind frent. * In the latter case, we go back one step and have the predecessor. * There may be none if the new fragment will be the first one. */ if (prev->fe_off > frent->fe_off) { prev = TAILQ_PREV(prev, pf_fragq, fr_next); if (prev == NULL) return NULL; KASSERT(prev->fe_off <= frent->fe_off, ("prev->fe_off <= frent->fe_off")); return prev; } /* * Now prev holds the first fragment of the entry point and the offset * of frent is behind it. Find the closest previous fragment.
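* (A sketch of the scan below: walk fr_next from that first fragment and stop before the first entry whose fe_off exceeds frent->fe_off; the last fragment visited is the closest predecessor. The walk stays short because a single entry point holds at most PF_FRAG_ENTRY_LIMIT fragments.)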
*/ for (next = TAILQ_NEXT(prev, fr_next); next != NULL; next = TAILQ_NEXT(next, fr_next)) { if (next->fe_off > frent->fe_off) break; prev = next; } return prev; } static struct pf_fragment * pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; uint16_t total; int old_index, new_index; PF_FRAG_ASSERT(); /* No empty fragments. */ if (frent->fe_len == 0) { DPFPRINTF(("bad fragment: len 0\n")); goto bad_fragment; } /* All fragments are 8 byte aligned. */ if (frent->fe_mff && (frent->fe_len & 0x7)) { DPFPRINTF(("bad fragment: mff and len %d\n", frent->fe_len)); goto bad_fragment; } /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { DPFPRINTF(("bad fragment: max packet %d\n", frent->fe_off + frent->fe_len)); goto bad_fragment; } DPFPRINTF((key->frc_af == AF_INET ? "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n", key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); /* Fully buffer all of the fragments in this fragment queue. */ frag = pf_find_fragment(key, &V_pf_frag_tree); /* Create a new reassembly queue for this packet. */ if (frag == NULL) { frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { pf_flush_fragments(); frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { REASON_SET(reason, PFRES_MEMORY); goto drop_fragment; } } *(struct pf_fragment_cmp *)frag = *key; memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff)); memset(frag->fr_entries, 0, sizeof(frag->fr_entries)); frag->fr_timeout = time_uptime; frag->fr_maxlen = frent->fe_len; frag->fr_holes = 1; TAILQ_INIT(&frag->fr_queue); RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); /* We do not have a previous fragment, cannot fail. */ pf_frent_insert(frag, frent, NULL); return (frag); } KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); /* Remember maximum fragment len for refragmentation. */ if (frent->fe_len > frag->fr_maxlen) frag->fr_maxlen = frent->fe_len; /* Maximum data we have seen already. */ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; /* Non terminal fragments must have more fragments flag. */ if (frent->fe_off + frent->fe_len < total && !frent->fe_mff) goto bad_fragment; /* Check if we saw the last fragment already. 
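* That is: once a fragment without fe_mff has been queued, "total" is the definitive packet length, so a new fragment may neither extend past it nor claim more fragments while ending exactly at it; if the end has not been seen yet, a fragment ending exactly at "total" with fe_mff cleared would contradict the queued fragment that ends there with fe_mff set.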
if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) { if (frent->fe_off + frent->fe_len > total || (frent->fe_off + frent->fe_len == total && frent->fe_mff)) goto bad_fragment; } else { if (frent->fe_off + frent->fe_len == total && !frent->fe_mff) goto bad_fragment; } /* Find neighbors for newly inserted fragment */ prev = pf_frent_previous(frag, frent); if (prev == NULL) { after = TAILQ_FIRST(&frag->fr_queue); KASSERT(after != NULL, ("after != NULL")); } else { after = TAILQ_NEXT(prev, fr_next); } if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { uint16_t precut; precut = prev->fe_off + prev->fe_len - frent->fe_off; if (precut >= frent->fe_len) goto bad_fragment; DPFPRINTF(("overlap -%d\n", precut)); m_adj(frent->fe_m, precut); frent->fe_off += precut; frent->fe_len -= precut; } for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off; after = next) { uint16_t aftercut; aftercut = frent->fe_off + frent->fe_len - after->fe_off; DPFPRINTF(("adjust overlap %d\n", aftercut)); if (aftercut < after->fe_len) { m_adj(after->fe_m, aftercut); old_index = pf_frent_index(after); after->fe_off += aftercut; after->fe_len -= aftercut; new_index = pf_frent_index(after); if (old_index != new_index) { DPFPRINTF(("frag index %d, new %d", old_index, new_index)); /* Fragment switched queue as fe_off changed */ after->fe_off -= aftercut; after->fe_len += aftercut; /* Remove restored fragment from old queue */ pf_frent_remove(frag, after); after->fe_off += aftercut; after->fe_len -= aftercut; /* Insert into correct queue */ if (pf_frent_insert(frag, after, prev)) { DPFPRINTF( ("fragment requeue limit exceeded")); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); /* There is no way to recover */ goto bad_fragment; } } break; } /* This fragment is completely overlapped, lose it. */ next = TAILQ_NEXT(after, fr_next); pf_frent_remove(frag, after); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); } /* If part of the queue gets too long, there is no way to recover. */ if (pf_frent_insert(frag, frent, prev)) { DPFPRINTF(("fragment queue limit exceeded\n")); goto bad_fragment; } return (frag); bad_fragment: REASON_SET(reason, PFRES_FRAG); drop_fragment: uma_zfree(V_pf_frent_z, frent); return (NULL); } static struct mbuf * pf_join_fragment(struct pf_fragment *frag) { struct mbuf *m, *m2; struct pf_frent *frent, *next; frent = TAILQ_FIRST(&frag->fr_queue); next = TAILQ_NEXT(frent, fr_next); m = frent->fe_m; m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); for (frent = next; frent != NULL; frent = next) { next = TAILQ_NEXT(frent, fr_next); m2 = frent->fe_m; /* Strip off ip header. */ m_adj(m2, frent->fe_hdrlen); /* Strip off any trailing bytes. */ m_adj(m2, frent->fe_len - m2->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); m_cat(m, m2); } /* Remove from fragment queue.
*/ pf_remove_fragment(frag); return (m); } #ifdef INET static int pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; uint16_t total, hdrlen; /* Get an entry for the fragment queue */ if ((frent = pf_create_fragment(reason)) == NULL) return (PF_DROP); frent->fe_m = m; frent->fe_hdrlen = ip->ip_hl << 2; frent->fe_extoff = 0; frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2); frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; frent->fe_mff = ntohs(ip->ip_off) & IP_MF; pf_ip2key(ip, dir, &key); if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) return (PF_DROP); /* The mbuf is part of the fragment entry, no direct free or access */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes)); return (PF_PASS); /* drop because *m0 is NULL, no error */ } /* We have all the data */ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen; m = *m0 = pf_join_fragment(frag); frag = NULL; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } ip = mtod(m, struct ip *); ip->ip_sum = pf_cksum_fixup(ip->ip_sum, ip->ip_len, htons(hdrlen + total), 0); ip->ip_len = htons(hdrlen + total); ip->ip_sum = pf_cksum_fixup(ip->ip_sum, ip->ip_off, ip->ip_off & ~(IP_MF|IP_OFFMASK), 0); ip->ip_off &= ~(IP_MF|IP_OFFMASK); if (hdrlen + total > IP_MAXPACKET) { DPFPRINTF(("drop: too big: %d\n", total)); ip->ip_len = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test() */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); return (PF_PASS); } #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr, uint16_t hdrlen, uint16_t extoff, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; struct m_tag *mtag; struct pf_fragment_tag *ftag; int off; uint32_t frag_id; uint16_t total, maxlen; uint8_t proto; PF_FRAG_LOCK(); /* Get an entry for the fragment queue. */ if ((frent = pf_create_fragment(reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } frent->fe_m = m; frent->fe_hdrlen = hdrlen; frent->fe_extoff = extoff; frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen; frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; key.frc_src.v6 = ip6->ip6_src; key.frc_dst.v6 = ip6->ip6_dst; key.frc_af = AF_INET6; /* Only the first fragment's protocol is relevant. */ key.frc_proto = 0; key.frc_id = fraghdr->ip6f_ident; if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } /* The mbuf is part of the fragment entry, no direct free or access. */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes)); PF_FRAG_UNLOCK(); return (PF_PASS); /* Drop because *m0 is NULL, no error. */ } /* We have all the data. 
*/ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); extoff = frent->fe_extoff; maxlen = frag->fr_maxlen; frag_id = frag->fr_id; total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag); m = *m0 = pf_join_fragment(frag); frag = NULL; PF_FRAG_UNLOCK(); /* Take protocol from first fragment header. */ m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); proto = *(mtod(m, uint8_t *) + off); m = *m0; /* Delete frag6 header */ if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0) goto fail; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } if ((mtag = m_tag_get(PACKET_TAG_PF_REASSEMBLED, sizeof(struct pf_fragment_tag), M_NOWAIT)) == NULL) goto fail; ftag = (struct pf_fragment_tag *)(mtag + 1); ftag->ft_hdrlen = hdrlen; ftag->ft_extoff = extoff; ftag->ft_maxlen = maxlen; ftag->ft_id = frag_id; m_tag_prepend(m, mtag); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total); if (extoff) { /* Write protocol into next field of last extension header. */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); *(mtod(m, char *) + off) = proto; m = *m0; } else ip6->ip6_nxt = proto; if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) { DPFPRINTF(("drop: too big: %d\n", total)); ip6->ip6_plen = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip6->ip6_plen))); return (PF_PASS); fail: REASON_SET(reason, PFRES_MEMORY); /* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */ return (PF_DROP); } #endif /* INET6 */ #ifdef INET6 int pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag, bool forward) { struct mbuf *m = *m0, *t; struct ip6_hdr *hdr; struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(mtag + 1); struct pf_pdesc pd; uint32_t frag_id; uint16_t hdrlen, extoff, maxlen; uint8_t proto; int error, action; hdrlen = ftag->ft_hdrlen; extoff = ftag->ft_extoff; maxlen = ftag->ft_maxlen; frag_id = ftag->ft_id; m_tag_delete(m, mtag); mtag = NULL; ftag = NULL; if (extoff) { int off; /* Use protocol from next field of last extension header */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT((m != NULL), ("pf_refragment6: short mbuf chain")); proto = *(mtod(m, uint8_t *) + off); *(mtod(m, char *) + off) = IPPROTO_FRAGMENT; m = *m0; } else { hdr = mtod(m, struct ip6_hdr *); proto = hdr->ip6_nxt; hdr->ip6_nxt = IPPROTO_FRAGMENT; } /* In case of link-local traffic we'll need a scope set. */ hdr = mtod(m, struct ip6_hdr *); in6_setscope(&hdr->ip6_src, ifp, NULL); in6_setscope(&hdr->ip6_dst, ifp, NULL); /* The MTU must be a multiple of 8 bytes, or we risk doing the * fragmentation wrong. */ maxlen = maxlen & ~7; /* * Maxlen may be less than 8 if there was only a single * fragment. As it was fragmented before, add a fragment * header also for a single fragment. If total or maxlen * is less than 8, ip6_fragment() will return EMSGSIZE and * we drop the packet. */ error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id); m = (*m0)->m_nextpkt; (*m0)->m_nextpkt = NULL; if (error == 0) { /* The first mbuf contains the unfragmented packet. 
*/ m_freem(*m0); *m0 = NULL; action = PF_PASS; } else { /* Drop expects an mbuf to free. */ DPFPRINTF(("refragment error %d\n", error)); action = PF_DROP; } for (; m; m = t) { t = m->m_nextpkt; m->m_nextpkt = NULL; m->m_flags |= M_SKIP_FIREWALL; memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (error == 0) if (forward) { MPASS(m->m_pkthdr.rcvif != NULL); ip6_forward(m, 0); } else { (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } else m_freem(m); } return (action); } #endif /* INET6 */ #ifdef INET int pf_normalize_ip(struct mbuf **m0, struct pfi_kkif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_krule *r; struct ip *h = mtod(m, struct ip *); int mff = (ntohs(h->ip_off) & IP_MF); int hlen = h->ip_hl << 2; u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; u_int16_t max; int ip_len; int tag = -1; int verdict; int srs; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* Check if there are any scrub rules. Lack of scrub rules means that * packet normalization is enforced, just like in OpenBSD. */ srs = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != h->ip_p) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip_src.s_addr, AF_INET, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else break; } if (srs) { /* With scrub rules present IPv4 normalization happens only * if one of the rules has matched and it's not a "no scrub" rule */ if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); } else if ((!V_pf_status.reass && (h->ip_off & htons(IP_MF | IP_OFFMASK)))) { /* With no scrub rules IPv4 fragment reassembly depends on the * global switch. Fragments can be dropped early if reassembly * is disabled. */ REASON_SET(reason, PFRES_NORM); goto drop; } /* Check for illegal packets */ if (hlen < (int)sizeof(struct ip)) { REASON_SET(reason, PFRES_NORM); goto drop; } if (hlen > ntohs(h->ip_len)) { REASON_SET(reason, PFRES_NORM); goto drop; } /* Clear IP_DF if the rule uses the no-df option or we're in no-df mode */ if ((((r && r->rule_flag & PFRULE_NODF) || (V_pf_status.reass & PF_REASS_NODF)) && h->ip_off & htons(IP_DF))) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* We will need other tests here */ if (!fragoff && !mff) goto no_fragment; /* We're dealing with a fragment now. Don't allow fragments * with IP_DF to enter the cache. If the flag was cleared by * no-df above, fine. Otherwise drop it.
*/ if (h->ip_off & htons(IP_DF)) { DPFPRINTF(("IP_DF\n")); goto bad; } ip_len = ntohs(h->ip_len) - hlen; /* All fragments are 8-byte aligned */ if (mff && (ip_len & 0x7)) { DPFPRINTF(("mff and %d\n", ip_len)); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { DPFPRINTF(("max packet %d\n", fragoff + ip_len)); goto bad; } if (r == NULL || !(r->rule_flag & PFRULE_FRAGMENT_NOREASS)) { max = fragoff + ip_len; /* Fully buffer all of the fragments * Might return a completely reassembled mbuf, or NULL */ PF_FRAG_LOCK(); DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); verdict = pf_reassemble(m0, h, pd->dir, reason); PF_FRAG_UNLOCK(); if (verdict != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); h = mtod(m, struct ip *); no_fragment: /* At this point, only IP_DF is allowed in ip_off */ if (h->ip_off & ~htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } } return (PF_PASS); bad: DPFPRINTF(("dropping bad fragment\n")); REASON_SET(reason, PFRES_FRAG); drop: if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif #ifdef INET6 int pf_normalize_ip6(struct mbuf **m0, struct pfi_kkif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_krule *r; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); int extoff; int off; struct ip6_ext ext; struct ip6_opt opt; struct ip6_frag frag; u_int32_t plen; int optend; int ooff; u_int8_t proto; int terminal; int srs; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* Check if there are any scrub rules. Lack of scrub rules means that * packet normalization is enforced, just like in OpenBSD. */ srs = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET6) r = r->skip[PF_SKIP_AF].ptr; #if 0 /* header chain! */
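/* * Presumably disabled because matching r->proto against ip6_nxt is * only correct when no extension headers are present: ip6_nxt names * the first header in the chain, not the final transport protocol, * so the chain would have to be walked first. */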
else if (r->proto && r->proto != h->ip6_nxt) r = r->skip[PF_SKIP_PROTO].ptr; #endif else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip6_src, AF_INET6, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip6_dst, AF_INET6, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else break; } if (srs) { /* With scrub rules present IPv6 normalization happens only * if one of the rules has matched and it's not a "no scrub" rule */ if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); } /* Check for illegal packets */ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) goto drop; plen = ntohs(h->ip6_plen); /* jumbo payload option not supported */ if (plen == 0) goto drop; extoff = 0; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; terminal = 0; do { switch (proto) { case IPPROTO_FRAGMENT: goto fragment; break; case IPPROTO_AH: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; if (proto == IPPROTO_AH) off += (ext.ip6e_len + 2) * 4; else off += (ext.ip6e_len + 1) * 8; proto = ext.ip6e_nxt; break; case IPPROTO_HOPOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; optend = off + (ext.ip6e_len + 1) * 8; ooff = off + sizeof(ext); do { if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, NULL, AF_INET6)) goto shortpkt; if (opt.ip6o_type == IP6OPT_PAD1) { ooff++; continue; } if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), NULL, NULL, AF_INET6)) goto shortpkt; if (ooff + sizeof(opt) + opt.ip6o_len > optend) goto drop; if (opt.ip6o_type == IP6OPT_JUMBO) goto drop; ooff += sizeof(opt) + opt.ip6o_len; } while (ooff < optend); off = optend; proto = ext.ip6e_nxt; break; default: terminal = 1; break; } } while (!terminal); if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; return (PF_PASS); fragment: if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) goto shortpkt; /* Offset now points to data portion. */ off += sizeof(frag); /* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */ if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); pd->flags |= PFDESC_IP_REAS; return (PF_PASS); shortpkt: REASON_SET(reason, PFRES_SHORT); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif /* INET6 */ int pf_normalize_tcp(struct pfi_kkif *kif, struct mbuf *m, int ipoff, int off, void *h, struct pf_pdesc *pd) { struct pf_krule *r, *rm = NULL; struct tcphdr *th = &pd->hdr.tcp; int rewrite = 0; u_short reason; u_int8_t flags; sa_family_t af = pd->af; int srs; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* Check if there are any scrub rules. Lack of scrub rules means that * packet normalization is enforced, just like in OpenBSD. */
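/* * A failed criterion advances r via the precomputed skip steps * (r->skip[]), jumping over all consecutive rules that would fail * the same test rather than evaluating them one by one. */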
srs = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], th->th_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], th->th_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) r = TAILQ_NEXT(r, entries); else { rm = r; break; } } if (srs) { /* With scrub rules present TCP normalization happens only * if one of the rules has matched and it's not a "no scrub" rule */ if (rm == NULL || rm->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(rm, &pd->act); } if (rm && rm->rule_flag & PFRULE_REASSEMBLE_TCP) pd->flags |= PFDESC_TCP_NORM; flags = th->th_flags; if (flags & TH_SYN) { /* Illegal packet */ if (flags & TH_RST) goto tcp_drop; if (flags & TH_FIN) goto tcp_drop; } else { /* Illegal packet */ if (!(flags & (TH_ACK|TH_RST))) goto tcp_drop; } if (!(flags & TH_ACK)) { /* These flags are only valid if ACK is set */ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) goto tcp_drop; } /* Check for illegal header length */ if (th->th_off < (sizeof(struct tcphdr) >> 2)) goto tcp_drop; /* If flags changed, or reserved data set, then adjust */ if (flags != th->th_flags || th->th_x2 != 0) { u_int16_t ov, nv; ov = *(u_int16_t *)(&th->th_ack + 1); th->th_flags = flags; th->th_x2 = 0; nv = *(u_int16_t *)(&th->th_ack + 1); th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0); rewrite = 1; } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp, 0, 0); th->th_urp = 0; rewrite = 1; } /* copy back packet headers if we sanitized */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); tcp_drop: REASON_SET(&reason, PFRES_NORM); if (rm != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, reason, r, NULL, NULL, pd, 1); return (PF_DROP); } int pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) { u_int32_t tsval, tsecr; u_int8_t hdr[60]; u_int8_t *opt; KASSERT((src->scrub == NULL), ("pf_normalize_tcp_init: src->scrub != NULL")); src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); switch (pd->af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); src->scrub->pfss_ttl = h->ip_ttl; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); src->scrub->pfss_ttl = h->ip6_hlim; break; } #endif /* INET6 */ } /* * All normalizations below are only begun if we see the start of * the connection. They must all set an enabled bit in pfss_flags. */ if ((th->th_flags & TH_SYN) == 0) return (0);
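/* * On the initial SYN, walk the TCP options looking for a timestamp. * If one is present, record the peer's initial tsval and tsecr as * the PAWS baseline and draw a random per-connection modulator * (pfss_ts_mod) that pf_normalize_tcp_stateful() later applies to * every timestamp it forwards. */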
if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: if (opt[1] >= TCPOLEN_TIMESTAMP) { src->scrub->pfss_flags |= PFSS_TIMESTAMP; src->scrub->pfss_ts_mod = htonl(arc4random()); /* note PFSS_PAWS not set yet */ memcpy(&tsval, &opt[2], sizeof(u_int32_t)); memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); src->scrub->pfss_tsval0 = ntohl(tsval); src->scrub->pfss_tsval = ntohl(tsval); src->scrub->pfss_tsecr = ntohl(tsecr); getmicrouptime(&src->scrub->pfss_last); } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } } return (0); } void pf_normalize_tcp_cleanup(struct pf_kstate *state) { uma_zfree(V_pf_state_scrub_z, state->src.scrub); uma_zfree(V_pf_state_scrub_z, state->dst.scrub); /* Someday... flush the TCP segment reassembly descriptors. */ } int pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, struct tcphdr *th, struct pf_kstate *state, struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) { struct timeval uptime; u_int32_t tsval, tsecr; u_int tsval_from_last; u_int8_t hdr[60]; u_int8_t *opt; int copyback = 0; int got_ts = 0; size_t startoff; KASSERT((src->scrub || dst->scrub), ("%s: src->scrub && dst->scrub!", __func__)); /* * Enforce the minimum TTL seen for this connection. Negate a common * technique to evade an intrusion detection system and confuse * firewall state code. */ switch (pd->af) { #ifdef INET case AF_INET: { if (src->scrub) { struct ip *h = mtod(m, struct ip *); if (h->ip_ttl > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip_ttl; h->ip_ttl = src->scrub->pfss_ttl; } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { if (src->scrub) { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (h->ip6_hlim > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip6_hlim; h->ip6_hlim = src->scrub->pfss_ttl; } break; } #endif /* INET6 */ } if (th->th_off > (sizeof(struct tcphdr) >> 2) && ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { startoff = opt - (hdr + sizeof(struct tcphdr)); switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: /* Modulate the timestamps. Can be used for * NAT detection, OS uptime determination or * reboot detection. */ if (got_ts) { /* Huh? Multiple timestamps!? */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("multiple TS??\n")); pf_print_state(state); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); }
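/* * Rewrite both timestamp fields in place: the sender's tsval has * this direction's random modulator added, and the echoed tsecr * has the opposite direction's modulator subtracted, so each host * gets back exactly the tsval it originally sent while an observer * beyond the firewall only sees modulated values. * pf_patch_32_unaligned() also fixes up th_sum incrementally. */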
if (opt[1] >= TCPOLEN_TIMESTAMP) { memcpy(&tsval, &opt[2], sizeof(u_int32_t)); if (tsval && src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsval = ntohl(tsval); pf_patch_32_unaligned(m, &th->th_sum, &opt[2], htonl(tsval + src->scrub->pfss_ts_mod), PF_ALGNMNT(startoff), 0); copyback = 1; } /* Modulate TS reply iff valid (!0) */ memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); if (tsecr && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsecr = ntohl(tsecr) - dst->scrub->pfss_ts_mod; pf_patch_32_unaligned(m, &th->th_sum, &opt[6], htonl(tsecr), PF_ALGNMNT(startoff), 0); copyback = 1; } got_ts = 1; } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } if (copyback) { /* Copy back the options; the caller copies back the header */ *writeback = 1; m_copyback(m, off + sizeof(struct tcphdr), (th->th_off << 2) - sizeof(struct tcphdr), hdr + sizeof(struct tcphdr)); } } /* * Must invalidate PAWS checks on connections idle for too long. * The fastest allowed timestamp clock is 1ms. That turns out to * be about 24 days before it wraps. XXX Right now our lowerbound * TS echo check only works for the first 12 days of a connection * when the TS has exhausted half its 32bit space */ #define TS_MAX_IDLE (24*24*60*60) #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ getmicrouptime(&uptime); if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - state->creation > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("src idled out of PAWS\n")); pf_print_state(state); printf("\n"); } src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("dst idled out of PAWS\n")); pf_print_state(state); printf("\n"); } dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (got_ts && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Validate that the timestamps are "in-window". * RFC1323 describes TCP Timestamp options that allow * measurement of RTT (round trip time) and PAWS * (protection against wrapped sequence numbers). PAWS * gives us a set of rules for rejecting packets on * long fat pipes (packets that were somehow delayed * in transit longer than the time it took to send the * full TCP sequence space of 4Gb). We can use these * rules and infer a few others that will let us treat * the 32bit timestamp and the 32bit echoed timestamp * as sequence numbers to prevent a blind attacker from * inserting packets into a connection. * * RFC1323 tells us: * - The timestamp on this packet must be greater than * or equal to the last value echoed by the other * endpoint. The RFC says those will be discarded * since it is a dup that has already been acked. * This gives us a lowerbound on the timestamp. * timestamp >= other last echoed timestamp * - The timestamp will be less than or equal to * the last timestamp plus the time between the * last packet and now. The RFC defines the max * clock rate as 1ms. We will allow clocks to be * up to 10% fast and will allow a total difference * of 30 seconds due to a route change. And this * gives us an upperbound on the timestamp. * timestamp <= last timestamp + max ticks * We have to be careful here. Windows will send an * initial timestamp of zero and then initialize it * to a random value after the 3whs; presumably to * avoid a DoS by having to call an expensive RNG * during a SYN flood. Proof MS has at least one * good security geek. * * - The TCP timestamp option must also echo the other * endpoint's timestamp. The timestamp echoed is the * one carried on the earliest unacknowledged segment * on the left edge of the sequence window. The RFC * states that the host will reject any echoed * timestamps that were larger than any ever sent. * This gives us an upperbound on the TS echo. * tsecr <= largest_tsval * - The lowerbound on the TS echo is a little more * tricky to determine. The other endpoint's echoed * values will not decrease. But there may be * network conditions that re-order packets and * cause our view of them to decrease. For now the * only lowerbound we can safely determine is that * the TS echo will never be less than the original * TS. XXX There is probably a better lowerbound. * Remove TS_MAX_CONN with better lowerbound check. * tsecr >= other original TS * * It is also important to note that the fastest * timestamp clock of 1ms will wrap its 32bit space in * 24 days. So we just disable TS checking after 24 * days of idle time. We actually must use a 12d * connection limit until we can come up with a better * lowerbound to the TS echo check. */
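/* * Worked example for the bound computed below: with TS_MAXFREQ at * 1100 ticks/s, a segment arriving 10 seconds after the previous * one with a 1 second fudge may advance tsval by at most * (10 + 1) * 1100 = 12100 ticks before check '1' fires. */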
struct timeval delta_ts; int ts_fudge; /* * PFTM_TS_DIFF is how many seconds of leeway to allow * a host's timestamp. This can happen if the previous * packet got delayed in transit for much longer than * this packet. */ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; /* Calculate max ticks since the last timestamp */ #define TS_MAXFREQ 1100 /* RFC max TS freq of 1 kHz + 10% skew */ #define TS_MICROSECS 1000000 /* microseconds per second */ delta_ts = uptime; timevalsub(&delta_ts, &src->scrub->pfss_last); tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); if ((src->state >= TCPS_ESTABLISHED && dst->state >= TCPS_ESTABLISHED) && (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { /* Bad RFC1323 implementation or an insertion attack. * * - Solaris 2.6 and 2.7 are known to send another ACK * after the FIN,FIN|ACK,ACK closing that carries * an old timestamp. */ DPFPRINTF(("Timestamp failed %c%c%c%c\n", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', SEQ_LT(tsecr, dst->scrub->pfss_tsval0) ? '3' : ' '));
DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " "idle: %jus %lums\n", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, delta_ts.tv_usec / 1000)); DPFPRINTF((" src->tsval: %u tsecr: %u\n", src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" "\n", dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } /* XXX I'd really like to require tsecr but it's optional */ } else if (!got_ts && (th->th_flags & TH_RST) == 0 && ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) || pd->p_len > 0 || (th->th_flags & TH_SYN)) && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Didn't send a timestamp. Timestamps aren't really useful * when: * - connection opening or closing (often not even sent). * but we must not let an attacker put a FIN on a * data packet to sneak it through our ESTABLISHED check. * - on a TCP reset. RFC suggests not even looking at TS. * - on an empty ACK. The TS will not be echoed so it will * probably not help keep the RTT calculation in sync and * there isn't as much danger when the sequence numbers * got wrapped. So some stacks don't include TS on empty * ACKs :-( * * To minimize the disruption to mostly RFC1323 conformant * stacks, we will only require timestamps on data packets. * * And what do ya know, we cannot require timestamps on data * packets. There appear to be devices that do legitimate * TCP connection hijacking. There are HTTP devices that allow * a 3whs (with timestamps) and then buffer the HTTP request. * If the intermediate device has the HTTP response cache, it * will spoof the response but not bother timestamping its * packets. So we can look for the presence of a timestamp in * the first data packet and if there, require it in all future * packets. */ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { /* * Hey! Someone tried to sneak a packet in. Or the * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("Did not receive expected RFC1323 " "timestamp\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } } /* * We will note if a host sends its data packets with or without * timestamps. And require all data packets to contain a timestamp * if the first does. PAWS implicitly requires that all data packets be * timestamped. But I think there are middle-man devices that hijack * TCP streams immediately after the 3whs and don't timestamp their * packets (seen in a WWW accelerator or cache). */ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { if (got_ts) src->scrub->pfss_flags |= PFSS_DATA_TS; else { src->scrub->pfss_flags |= PFSS_DATA_NOTS; if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ DPFPRINTF(("Broken RFC1323 stack did not " "timestamp data packet. Disabled PAWS " "security.\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } } }
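/* * Bookkeeping summary for the block below: pfss_tsval tracks the * highest timestamp the peer has sent, pfss_tsecr the highest value * it has echoed, and pfss_tsval0 its lowest (initial) timestamp; * PFSS_PAWS is only armed once a timestamp has been echoed * (tsecr != 0). */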
/* * Update PAWS values */ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { getmicrouptime(&src->scrub->pfss_last); if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsval = tsval; if (tsecr) { if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsecr = tsecr; if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && (SEQ_LT(tsval, src->scrub->pfss_tsval0) || src->scrub->pfss_tsval0 == 0)) { /* tsval0 MUST be the lowest timestamp */ src->scrub->pfss_tsval0 = tsval; } /* Only fully initialized after a TS gets echoed */ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_flags |= PFSS_PAWS; } } /* I have a dream.... TCP segment reassembly.... */ return (0); } int pf_normalize_mss(struct mbuf *m, int off, struct pf_pdesc *pd) { struct tcphdr *th = &pd->hdr.tcp; u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; u_char opts[TCP_MAXOLEN]; u_char *optp = opts; size_t startoff; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, NULL, NULL, pd->af)) return (0); for (; cnt > 0; cnt -= optlen, optp += optlen) { startoff = optp - opts; opt = optp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = optp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: mss = (u_int16_t *)(optp + 2); if ((ntohs(*mss)) > pd->act.max_mss) { pf_patch_16_unaligned(m, &th->th_sum, mss, htons(pd->act.max_mss), PF_ALGNMNT(startoff), 0); m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); m_copyback(m, off, sizeof(*th), (caddr_t)th); } break; default: break; } } return (0); } +static int +pf_scan_sctp(struct mbuf *m, int ipoff, int off, struct pf_pdesc *pd) +{ + struct sctp_chunkhdr ch = { }; + int chunk_off = sizeof(struct sctphdr); + int chunk_start; + + while (off + chunk_off < pd->tot_len) { + if (!pf_pull_hdr(m, off + chunk_off, &ch, sizeof(ch), NULL, + NULL, pd->af)) + return (PF_DROP); + + /* Length includes the header, so it must be at least 4. */ + if (ntohs(ch.chunk_length) < 4) + return (PF_DROP); + + chunk_start = chunk_off; + chunk_off += roundup(ntohs(ch.chunk_length), 4); + + switch (ch.chunk_type) { + case SCTP_INITIATION: { + struct sctp_init_chunk init; + + if (!pf_pull_hdr(m, off + chunk_start, &init, + sizeof(init), NULL, NULL, pd->af)) + return (PF_DROP); + + /* + * RFC 9260, Section 3.3.2, "The Initiate Tag is allowed to have + * any value except 0." + */ + if (init.init.initiate_tag == 0) + return (PF_DROP); + if (init.init.num_inbound_streams == 0) + return (PF_DROP); + if (init.init.num_outbound_streams == 0) + return (PF_DROP); + if (ntohl(init.init.a_rwnd) < SCTP_MIN_RWND) + return (PF_DROP); + + /* + * RFC 9260, Section 3.1, INIT chunks MUST have zero + * verification tag. */ + if (pd->hdr.sctp.v_tag != 0) + return (PF_DROP); + + pd->sctp_flags |= PFDESC_SCTP_INIT; + break; + }
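+ /* + * The remaining chunk types are not validated individually; they + * only record a flag in pd->sctp_flags so that the combination can + * be checked once the whole packet has been walked. + */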
+ case SCTP_INITIATION_ACK: + pd->sctp_flags |= PFDESC_SCTP_INIT_ACK; + break; + case SCTP_ABORT_ASSOCIATION: + pd->sctp_flags |= PFDESC_SCTP_ABORT; + break; + case SCTP_SHUTDOWN: + case SCTP_SHUTDOWN_ACK: + pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN; + break; + case SCTP_SHUTDOWN_COMPLETE: + pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN_COMPLETE; + break; + case SCTP_COOKIE_ECHO: + case SCTP_COOKIE_ACK: + pd->sctp_flags |= PFDESC_SCTP_COOKIE; + break; + case SCTP_DATA: + pd->sctp_flags |= PFDESC_SCTP_DATA; + break; + default: + pd->sctp_flags |= PFDESC_SCTP_OTHER; + break; + } + } + + /* Validate chunk lengths vs. packet length. */ + if (off + chunk_off != pd->tot_len) + return (PF_DROP); + + /* + * INIT, INIT_ACK or SHUTDOWN_COMPLETE chunks must always be the only + * one in a packet. + */ + if ((pd->sctp_flags & PFDESC_SCTP_INIT) && + (pd->sctp_flags & ~PFDESC_SCTP_INIT)) + return (PF_DROP); + if ((pd->sctp_flags & PFDESC_SCTP_INIT_ACK) && + (pd->sctp_flags & ~PFDESC_SCTP_INIT_ACK)) + return (PF_DROP); + if ((pd->sctp_flags & PFDESC_SCTP_SHUTDOWN_COMPLETE) && + (pd->sctp_flags & ~PFDESC_SCTP_SHUTDOWN_COMPLETE)) + return (PF_DROP); + + return (PF_PASS); +} + +int +pf_normalize_sctp(int dir, struct pfi_kkif *kif, struct mbuf *m, int ipoff, + int off, void *h, struct pf_pdesc *pd) +{ + struct pf_krule *r, *rm = NULL; + struct sctphdr *sh = &pd->hdr.sctp; + u_short reason; + sa_family_t af = pd->af; + int srs; + + PF_RULES_RASSERT(); + + /* Unconditionally scan the SCTP packet, because we need to look for + * things like shutdown and asconf chunks. */ + if (pf_scan_sctp(m, ipoff, off, pd) != PF_PASS) + goto sctp_drop; + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + /* Check if there are any scrub rules. Lack of scrub rules means that + * packet normalization is enforced, just like in OpenBSD. */
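+ /* + * The rule walk below mirrors pf_normalize_tcp(), but keys on the + * SCTP source and destination ports from the common header. + */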
+ srs = (r != NULL); + while (r != NULL) { + pf_counter_u64_add(&r->evaluations, 1); + if (pfi_kkif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], sh->src_port)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], sh->dest_port)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else { + rm = r; + break; + } + } + + if (srs) { + /* With scrub rules present SCTP normalization happens only + * if one of the rules has matched and it's not a "no scrub" rule */ + if (rm == NULL || rm->action == PF_NOSCRUB) + return (PF_PASS); + + pf_counter_u64_critical_enter(); + pf_counter_u64_add_protected(&r->packets[dir == PF_OUT], 1); + pf_counter_u64_add_protected(&r->bytes[dir == PF_OUT], pd->tot_len); + pf_counter_u64_critical_exit(); + } + + /* Verify we're a multiple of 4 bytes long */ + if ((pd->tot_len - off - sizeof(struct sctphdr)) % 4) + goto sctp_drop; + + /* INIT chunk needs to be the only chunk */ + if (pd->sctp_flags & PFDESC_SCTP_INIT) + if (pd->sctp_flags & ~PFDESC_SCTP_INIT) + goto sctp_drop; + + return (PF_PASS); + +sctp_drop: + REASON_SET(&reason, PFRES_NORM); + if (rm != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, reason, r, NULL, NULL, pd, + 1); + + return (PF_DROP); +} + #ifdef INET void pf_scrub_ip(struct mbuf **m0, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct ip *h = mtod(m, struct ip *); /* Clear IP_DF if no-df was requested */ if (pd->act.flags & PFSTATE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* Enforce a minimum ttl, may cause endless packet loops */ if (pd->act.min_ttl && h->ip_ttl < pd->act.min_ttl) { u_int16_t ip_ttl = h->ip_ttl; h->ip_ttl = pd->act.min_ttl; h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); } /* Enforce tos */ if (pd->act.flags & PFSTATE_SETTOS) { u_int16_t ov, nv; ov = *(u_int16_t *)h; h->ip_tos = pd->act.set_tos | (h->ip_tos & IPTOS_ECN_MASK); nv = *(u_int16_t *)h; h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); } /* random-id, but not for fragments */ if (pd->act.flags & PFSTATE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { uint16_t ip_id = h->ip_id; ip_fillid(h); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); } } #endif /* INET */ #ifdef INET6 void pf_scrub_ip6(struct mbuf **m0, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); /* Enforce a minimum ttl, may cause endless packet loops */ if (pd->act.min_ttl && h->ip6_hlim < pd->act.min_ttl) h->ip6_hlim = pd->act.min_ttl; /* Enforce tos. Set traffic class bits */ if (pd->act.flags & PFSTATE_SETTOS) { h->ip6_flow &= IPV6_FLOWLABEL_MASK | IPV6_VERSION_MASK; h->ip6_flow |= htonl((pd->act.set_tos | IPV6_ECN(h)) << 20); } } #endif