diff --git a/sbin/pfctl/parse.y b/sbin/pfctl/parse.y --- a/sbin/pfctl/parse.y +++ b/sbin/pfctl/parse.y @@ -326,6 +326,7 @@ int marker; #define POM_TYPE 0x01 #define POM_STICKYADDRESS 0x02 +#define POM_ENDPI 0x04 u_int8_t opts; int type; int staticport; @@ -512,7 +513,7 @@ %token UPPERLIMIT QUEUE PRIORITY QLIMIT HOGS BUCKETS RTABLE TARGET INTERVAL %token DNPIPE DNQUEUE RIDENTIFIER %token LOAD RULESET_OPTIMIZATION PRIO -%token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE +%token STICKYADDRESS ENDPI MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE %token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW %token TAGGED TAG IFBOUND FLOATING STATEPOLICY STATEDEFAULTS ROUTE SETTOS %token DIVERTTO DIVERTREPLY BRIDGE_TO @@ -4593,6 +4594,14 @@ pool_opts.marker |= POM_STICKYADDRESS; pool_opts.opts |= PF_POOL_STICKYADDR; } + | ENDPI { + if (pool_opts.marker & POM_ENDPI) { + yyerror("endpoint-independent cannot be redefined"); + YYERROR; + } + pool_opts.marker |= POM_ENDPI; + pool_opts.opts |= PF_POOL_ENDPI; + } | MAPEPORTSET number '/' number '/' number { if (pool_opts.mape.offset) { yyerror("map-e-portset cannot be redefined"); @@ -6299,6 +6308,7 @@ { "dnqueue", DNQUEUE}, { "drop", DROP}, { "dup-to", DUPTO}, + { "endpoint-independent", ENDPI}, { "ether", ETHER}, { "fail-policy", FAILPOLICY}, { "fairq", FAIRQ}, diff --git a/sbin/pfctl/pfctl_parser.c b/sbin/pfctl/pfctl_parser.c --- a/sbin/pfctl/pfctl_parser.c +++ b/sbin/pfctl/pfctl_parser.c @@ -488,6 +488,8 @@ } if (pool->opts & PF_POOL_STICKYADDR) printf(" sticky-address"); + if (pool->opts & PF_POOL_ENDPI) + printf(" endpoint-independent"); if (id == PF_NAT && p1 == 0 && p2 == 0) printf(" static-port"); if (pool->mape.offset > 0) diff --git a/sbin/pfctl/tests/files/pf1021.in b/sbin/pfctl/tests/files/pf1021.in new file mode 100644 --- /dev/null +++ b/sbin/pfctl/tests/files/pf1021.in @@ -0,0 +1 @@ +nat on vtnet1 inet from ! (vtnet1) to any -> (vtnet1) endpoint-independent diff --git a/sbin/pfctl/tests/files/pf1021.ok b/sbin/pfctl/tests/files/pf1021.ok new file mode 100644 --- /dev/null +++ b/sbin/pfctl/tests/files/pf1021.ok @@ -0,0 +1 @@ +nat on vtnet1 inet from ! (vtnet1) to any -> (vtnet1) round-robin endpoint-independent diff --git a/share/man/man4/pf.4 b/share/man/man4/pf.4 --- a/share/man/man4/pf.4 +++ b/share/man/man4/pf.4 @@ -26,7 +26,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 2, 2024 +.Dd September 6, 2024 .Dt PF 4 .Os .Sh NAME @@ -89,6 +89,10 @@ Default value is 32768. .It Va net.pf.rule_tag_hashsize Size of the hash table that stores tags. +.It Va net.pf.udpendpoint_hashsize +Size of hash table that store UDP endpoint mappings. +Should be power of 2. +Default value is 32768. .It Va net.pf.default_to_drop This value overrides .Cd "options PF_DEFAULT_TO_DROP" diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5 --- a/share/man/man5/pf.conf.5 +++ b/share/man/man5/pf.conf.5 @@ -27,7 +27,7 @@ .\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd June 24, 2024 +.Dd September 4, 2024 .Dt PF.CONF 5 .Os .Sh NAME @@ -2278,6 +2278,16 @@ With .Ar nat rules, the +.It Ar endpoint-independent +With +.Ar nat +rules, the +.Ar endpoint-independent +option caues +.Xr pf 4 +to always map connections from a UDP source address and port to the same +NAT address and port. +This feature implements "full-cone" NAT behavior. .Ar map-e-portset option enables the source port translation of MAP-E (RFC 7597) Customer Edge. In order to make the host act as a MAP-E Customer Edge, setting up a tunneling diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -940,6 +940,29 @@ u_int8_t pad[1]; }; +/* Keep synced with struct pf_udp_endpoint. */ +struct pf_udp_endpoint_cmp { + struct pf_addr addr; + uint16_t port; + sa_family_t af; + uint8_t pad[1]; +}; + +struct pf_udp_endpoint { + struct pf_addr addr; + uint16_t port; + sa_family_t af; + uint8_t pad[1]; + + struct pf_udp_mapping *mapping; + LIST_ENTRY(pf_udp_endpoint) entry; +}; + +struct pf_udp_mapping { + struct pf_udp_endpoint endpoints[2]; + u_int refs; +}; + /* Keep synced with struct pf_state_key. */ struct pf_state_key_cmp { struct pf_addr addr[2]; @@ -1069,6 +1092,7 @@ union pf_krule_ptr nat_rule; struct pf_addr rt_addr; struct pf_state_key *key[2]; /* addresses stack and wire */ + struct pf_udp_mapping *udp_mapping; struct pfi_kkif *kif; struct pfi_kkif *orig_kif; /* The real kif, even if we're a floating state (i.e. if == V_pfi_all). */ struct pfi_kkif *rt_kif; @@ -2124,17 +2148,28 @@ struct mtx lock; }; +struct pf_udpendpointhash { + LIST_HEAD(, pf_udp_endpoint) endpoints; + /* refcont is synchronized on the source endpoint's row lock */ + struct mtx lock; +}; + extern u_long pf_ioctl_maxcount; VNET_DECLARE(u_long, pf_hashmask); #define V_pf_hashmask VNET(pf_hashmask) VNET_DECLARE(u_long, pf_srchashmask); #define V_pf_srchashmask VNET(pf_srchashmask) +VNET_DECLARE(u_long, pf_udpendpointhashmask); +#define V_pf_udpendpointhashmask VNET(pf_udpendpointhashmask) #define PF_HASHSIZ (131072) #define PF_SRCHASHSIZ (PF_HASHSIZ/4) +#define PF_UDPENDHASHSIZ (PF_HASHSIZ/4) VNET_DECLARE(struct pf_keyhash *, pf_keyhash); VNET_DECLARE(struct pf_idhash *, pf_idhash); +VNET_DECLARE(struct pf_udpendpointhash *, pf_udpendpointhash); #define V_pf_keyhash VNET(pf_keyhash) #define V_pf_idhash VNET(pf_idhash) +#define V_pf_udpendpointhash VNET(pf_udpendpointhash) VNET_DECLARE(struct pf_srchash *, pf_srchash); #define V_pf_srchash VNET(pf_srchash) @@ -2209,6 +2244,8 @@ #define V_pf_state_z VNET(pf_state_z) VNET_DECLARE(uma_zone_t, pf_state_key_z); #define V_pf_state_key_z VNET(pf_state_key_z) +VNET_DECLARE(uma_zone_t, pf_udp_mapping_z); +#define V_pf_udp_mapping_z VNET(pf_udp_mapping_z) VNET_DECLARE(uma_zone_t, pf_state_scrub_z); #define V_pf_state_scrub_z VNET(pf_state_scrub_z) @@ -2281,6 +2318,15 @@ extern bool pf_find_state_all_exists( const struct pf_state_key_cmp *, u_int); +extern struct pf_udp_mapping *pf_udp_mapping_find(struct pf_udp_endpoint_cmp + *endpoint); +extern struct pf_udp_mapping *pf_udp_mapping_create(sa_family_t af, + struct pf_addr *src_addr, uint16_t src_port, + struct pf_addr *nat_addr, uint16_t nat_port); +extern int pf_udp_mapping_insert(struct pf_udp_mapping + *mapping); +extern void pf_udp_mapping_release(struct pf_udp_mapping + *mapping); extern struct pf_ksrc_node *pf_find_src_node(struct pf_addr *, struct pf_krule *, sa_family_t, struct pf_srchash **, bool); @@ -2574,7 +2620,8 @@ struct pf_state_key **, struct pf_state_key **, struct pf_addr *, struct pf_addr *, uint16_t, uint16_t, struct pf_kanchor_stackframe *, - struct pf_krule **); + struct pf_krule **, + struct pf_udp_mapping **udp_mapping); struct pf_state_key *pf_state_key_setup(struct pf_pdesc *, struct mbuf *, int, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t); diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h --- a/sys/netpfil/pf/pf.h +++ b/sys/netpfil/pf/pf.h @@ -129,6 +129,7 @@ PF_ADDR_RANGE }; #define PF_POOL_TYPEMASK 0x0f #define PF_POOL_STICKYADDR 0x20 +#define PF_POOL_ENDPI 0x40 #define PF_WSCALE_FLAG 0x80 #define PF_WSCALE_MASK 0x0f diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -283,6 +283,7 @@ uma_zone_t pf_mtag_z; VNET_DEFINE(uma_zone_t, pf_state_z); VNET_DEFINE(uma_zone_t, pf_state_key_z); +VNET_DEFINE(uma_zone_t, pf_udp_mapping_z); VNET_DEFINE(struct unrhdr64, pf_stateid); @@ -330,7 +331,7 @@ struct pf_state_key *, struct mbuf *, int, u_int16_t, u_int16_t, int *, struct pfi_kkif *, struct pf_kstate **, int, u_int16_t, u_int16_t, - int, struct pf_krule_slist *); + int, struct pf_krule_slist *, struct pf_udp_mapping *); static int pf_state_key_addr_setup(struct pf_pdesc *, struct mbuf *, int, struct pf_state_key_cmp *, int, struct pf_addr *, int, struct pf_addr *, int); @@ -493,22 +494,29 @@ VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(struct pf_srchash *, pf_srchash); +VNET_DEFINE(struct pf_udpendpointhash *, pf_udpendpointhash); +VNET_DEFINE(struct pf_udpendpointmapping *, pf_udpendpointmapping); SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "pf(4)"); VNET_DEFINE(u_long, pf_hashmask); VNET_DEFINE(u_long, pf_srchashmask); +VNET_DEFINE(u_long, pf_udpendpointhashmask); VNET_DEFINE_STATIC(u_long, pf_hashsize); #define V_pf_hashsize VNET(pf_hashsize) VNET_DEFINE_STATIC(u_long, pf_srchashsize); #define V_pf_srchashsize VNET(pf_srchashsize) +VNET_DEFINE_STATIC(u_long, pf_udpendpointhashsize); +#define V_pf_udpendpointhashsize VNET(pf_udpendpointhashsize) u_long pf_ioctl_maxcount = 65535; SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable"); +SYSCTL_ULONG(_net_pf, OID_AUTO, udpendpoint_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, + &VNET_NAME(pf_udpendpointhashsize), 0, "Size of pf(4) endpoint hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN, &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call"); @@ -699,6 +707,17 @@ return (h & V_pf_srchashmask); } +static inline uint32_t +pf_hashudpendpoint(struct pf_udp_endpoint *endpoint) +{ + uint32_t h; + + h = murmur3_32_hash32((uint32_t *)endpoint, + sizeof(struct pf_udp_endpoint_cmp)/sizeof(uint32_t), + V_pf_hashseed); + return (h & V_pf_udpendpointhashmask); +} + #ifdef ALTQ static int pf_state_hash(struct pf_kstate *s) @@ -1086,12 +1105,15 @@ struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; + struct pf_udpendpointhash *uh; u_int i; if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize)) V_pf_hashsize = PF_HASHSIZ; if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize)) V_pf_srchashsize = PF_SRCHASHSIZ; + if (V_pf_udpendpointhashsize == 0 || !powerof2(V_pf_udpendpointhashsize)) + V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ; V_pf_hashseed = arc4random(); @@ -1154,6 +1176,30 @@ for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); + + /* UDP endpoint mappings. */ + V_pf_udp_mapping_z = uma_zcreate("pf UDP mappings", + sizeof(struct pf_udp_mapping), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize, + sizeof(struct pf_udpendpointhash), M_PFHASH, M_NOWAIT | M_ZERO); + if (V_pf_udpendpointhash == NULL) { + printf("pf: Unable to allocate memory for " + "udpendpoint_hashsize %lu.\n", V_pf_udpendpointhashsize); + + V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ; + V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize, + sizeof(struct pf_udpendpointhash), M_PFHASH, M_WAITOK | M_ZERO); + } + + V_pf_udpendpointhashmask = V_pf_udpendpointhashsize - 1; + for (i = 0, uh = V_pf_udpendpointhash; + i <= V_pf_udpendpointhashmask; + i++, uh++) { + mtx_init(&uh->lock, "pf_udpendpointhash", NULL, + MTX_DEF | MTX_DUPOK); + } + /* ALTQ */ TAILQ_INIT(&V_pf_altqs[0]); TAILQ_INIT(&V_pf_altqs[1]); @@ -1187,10 +1233,12 @@ struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; + struct pf_udpendpointhash *uh; struct pf_send_entry *pfse, *next; u_int i; - for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask; + for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; + i <= V_pf_hashmask; i++, kh++, ih++) { KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", __func__)); @@ -1209,6 +1257,15 @@ } free(V_pf_srchash, M_PFHASH); + for (i = 0, uh = V_pf_udpendpointhash; + i <= V_pf_udpendpointhashmask; + i++, uh++) { + KASSERT(LIST_EMPTY(&uh->endpoints), + ("%s: udp endpoint hash not empty", __func__)); + mtx_destroy(&uh->lock); + } + free(V_pf_udpendpointhash, M_PFHASH); + STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { m_freem(pfse->pfse_m); free(pfse, M_PFTEMP); @@ -1218,6 +1275,7 @@ uma_zdestroy(V_pf_sources_z); uma_zdestroy(V_pf_state_z); uma_zdestroy(V_pf_state_key_z); + uma_zdestroy(V_pf_udp_mapping_z); } static int @@ -1807,6 +1865,123 @@ return (false); } +struct pf_udp_mapping * +pf_udp_mapping_create(sa_family_t af, struct pf_addr *src_addr, uint16_t src_port, + struct pf_addr *nat_addr, uint16_t nat_port) +{ + struct pf_udp_mapping *mapping; + + mapping = uma_zalloc(V_pf_udp_mapping_z, M_NOWAIT | M_ZERO); + if (mapping == NULL) + return (NULL); + PF_ACPY(&mapping->endpoints[0].addr, src_addr, af); + mapping->endpoints[0].port = src_port; + mapping->endpoints[0].af = af; + mapping->endpoints[0].mapping = mapping; + PF_ACPY(&mapping->endpoints[1].addr, nat_addr, af); + mapping->endpoints[1].port = nat_port; + mapping->endpoints[1].af = af; + mapping->endpoints[1].mapping = mapping; + refcount_init(&mapping->refs, 1); + return (mapping); +} + +int +pf_udp_mapping_insert(struct pf_udp_mapping *mapping) +{ + struct pf_udpendpointhash *h0, *h1; + struct pf_udp_endpoint *endpoint; + int ret = EEXIST; + + h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])]; + h1 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])]; + if (h0 == h1) { + PF_HASHROW_LOCK(h0); + } else if (h0 < h1) { + PF_HASHROW_LOCK(h0); + PF_HASHROW_LOCK(h1); + } else { + PF_HASHROW_LOCK(h1); + PF_HASHROW_LOCK(h0); + } + + LIST_FOREACH(endpoint, &h0->endpoints, entry) { + if (bcmp(endpoint, &mapping->endpoints[0], + sizeof(struct pf_udp_endpoint_cmp)) == 0) + break; + } + if (endpoint != NULL) + goto cleanup; + LIST_FOREACH(endpoint, &h1->endpoints, entry) { + if (bcmp(endpoint, &mapping->endpoints[1], + sizeof(struct pf_udp_endpoint_cmp)) == 0) + break; + } + if (endpoint != NULL) + goto cleanup; + LIST_INSERT_HEAD(&h0->endpoints, &mapping->endpoints[0], entry); + LIST_INSERT_HEAD(&h1->endpoints, &mapping->endpoints[1], entry); + ret = 0; + +cleanup: + if (h0 != h1) { + PF_HASHROW_UNLOCK(h0); + PF_HASHROW_UNLOCK(h1); + } else { + PF_HASHROW_UNLOCK(h0); + } + return (ret); +} + +void +pf_udp_mapping_release(struct pf_udp_mapping *mapping) +{ + /* refcount is synchronized on the source endpoint's row lock */ + struct pf_udpendpointhash *h0, *h1; + + if (mapping == NULL) + return; + + h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])]; + PF_HASHROW_LOCK(h0); + if (refcount_release(&mapping->refs)) { + LIST_REMOVE(&mapping->endpoints[0], entry); + PF_HASHROW_UNLOCK(h0); + h1 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])]; + PF_HASHROW_LOCK(h1); + LIST_REMOVE(&mapping->endpoints[1], entry); + PF_HASHROW_UNLOCK(h1); + + uma_zfree(V_pf_udp_mapping_z, mapping); + } else { + PF_HASHROW_UNLOCK(h0); + } +} + + +struct pf_udp_mapping * +pf_udp_mapping_find(struct pf_udp_endpoint_cmp *key) +{ + struct pf_udpendpointhash *uh; + struct pf_udp_endpoint *endpoint; + + uh = &V_pf_udpendpointhash[pf_hashudpendpoint((struct pf_udp_endpoint*)key)]; + + PF_HASHROW_LOCK(uh); + LIST_FOREACH(endpoint, &uh->endpoints, entry) { + if (bcmp(endpoint, key, sizeof(struct pf_udp_endpoint_cmp)) == 0 && + bcmp(endpoint, &endpoint->mapping->endpoints[0], + sizeof(struct pf_udp_endpoint_cmp)) == 0) + break; + } + if (endpoint == NULL) { + PF_HASHROW_UNLOCK(uh); + return (NULL); + } + refcount_acquire(&endpoint->mapping->refs); + PF_HASHROW_UNLOCK(uh); + return (endpoint->mapping); +} /* END state table stuff */ static void @@ -2423,6 +2598,9 @@ PF_HASHROW_UNLOCK(ih); pf_detach_state(s); + + pf_udp_mapping_release(s->udp_mapping); + /* pf_state_insert() initialises refs to 2 */ return (pf_release_staten(s, 2)); } @@ -4686,6 +4864,7 @@ u_int16_t bproto_sum = 0, bip_sum = 0; u_int8_t icmptype = 0, icmpcode = 0; struct pf_kanchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; + struct pf_udp_mapping *udp_mapping = NULL; PF_RULES_RASSERT(); @@ -4760,7 +4939,7 @@ /* check packet for BINAT/NAT/RDR */ transerror = pf_get_translation(pd, m, off, kif, &nsn, &sk, - &nk, saddr, daddr, sport, dport, anchor_stack, &nr); + &nk, saddr, daddr, sport, dport, anchor_stack, &nr, &udp_mapping); switch (transerror) { default: /* A translation error occurred. */ @@ -5058,8 +5237,9 @@ int action; action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, - hdrlen, &match_rules); + hdrlen, &match_rules, udp_mapping); if (action != PF_PASS) { + pf_udp_mapping_release(udp_mapping); if (action == PF_DROP && (r->rule_flag & PFRULE_RETURN)) pf_return(r, nr, pd, sk, off, m, th, kif, @@ -5075,6 +5255,7 @@ uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); + pf_udp_mapping_release(udp_mapping); } /* copy back packet headers if we performed NAT operations */ @@ -5102,6 +5283,8 @@ uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); + pf_udp_mapping_release(udp_mapping); + return (PF_DROP); } @@ -5111,7 +5294,7 @@ struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm, int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, - struct pf_krule_slist *match_rules) + struct pf_krule_slist *match_rules, struct pf_udp_mapping *udp_mapping) { struct pf_kstate *s = NULL; struct pf_ksrc_node *sn = NULL; @@ -5328,6 +5511,8 @@ return (PF_SYNPROXY_DROP); } + s->udp_mapping = udp_mapping; + return (PF_PASS); csfailed: diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -62,7 +62,8 @@ uint16_t, int, struct pf_kanchor_stackframe *); static int pf_get_sport(sa_family_t, uint8_t, struct pf_krule *, struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *, - uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **); + uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **, + struct pf_udp_mapping **); #define mix(a,b,c) \ do { \ @@ -216,14 +217,47 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r, struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low, - uint16_t high, struct pf_ksrc_node **sn) + uint16_t high, struct pf_ksrc_node **sn, + struct pf_udp_mapping **udp_mapping) { struct pf_state_key_cmp key; struct pf_addr init_addr; + struct pf_srchash *sh = NULL; bzero(&init_addr, sizeof(init_addr)); + + MPASS(*udp_mapping == NULL); + + /* + * If we are UDP and have an existing mapping we can get source port + * from the mapping. In this case we have to look up the src_node as + * pf_map_addr would. + */ + if (proto == IPPROTO_UDP && (r->rpool.opts & PF_POOL_ENDPI)) { + struct pf_udp_endpoint_cmp udp_source; + + bzero(&udp_source, sizeof(udp_source)); + udp_source.af = af; + PF_ACPY(&udp_source.addr, saddr, af); + udp_source.port = sport; + *udp_mapping = pf_udp_mapping_find(&udp_source); + if (*udp_mapping) { + PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, af); + *nport = (*udp_mapping)->endpoints[1].port; + /* Try to find a src_node as per pf_map_addr(). */ + if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && + (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) + *sn = pf_find_src_node(saddr, r, af, &sh, 0); + return (0); + } else { + *udp_mapping = pf_udp_mapping_create(af, saddr, sport, &init_addr, 0); + if (*udp_mapping == NULL) + return (1); + } + } + if (pf_map_addr(af, r, saddr, naddr, NULL, &init_addr, sn)) - return (1); + goto failed; if (proto == IPPROTO_ICMP) { if (*nport == htons(ICMP_ECHO)) { @@ -250,6 +284,8 @@ do { PF_ACPY(&key.addr[1], naddr, key.af); + if (*udp_mapping) + PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, af); /* * port search; start random, step; @@ -277,8 +313,16 @@ } else if (low == high) { key.port[1] = htons(low); if (!pf_find_state_all_exists(&key, PF_IN)) { - *nport = htons(low); - return (0); + if (*udp_mapping != NULL) { + (*udp_mapping)->endpoints[1].port = htons(low); + if (pf_udp_mapping_insert(*udp_mapping) == 0) { + *nport = htons(low); + return (0); + } + } else { + *nport = htons(low); + return (0); + } } } else { uint32_t tmp; @@ -293,18 +337,35 @@ cut = arc4random() % (1 + high - low) + low; /* low <= cut <= high */ for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) { - key.port[1] = htons(tmp); - if (!pf_find_state_all_exists(&key, PF_IN)) { - *nport = htons(tmp); - return (0); + if (*udp_mapping != NULL) { + (*udp_mapping)->endpoints[1].port = htons(tmp); + if (pf_udp_mapping_insert(*udp_mapping) == 0) { + *nport = htons(tmp); + return (0); + } + } else { + key.port[1] = htons(tmp); + if (!pf_find_state_all_exists(&key, PF_IN)) { + *nport = htons(tmp); + return (0); + } } } tmp = cut; for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) { - key.port[1] = htons(tmp); - if (!pf_find_state_all_exists(&key, PF_IN)) { - *nport = htons(tmp); - return (0); + if (proto == IPPROTO_UDP && + (r->rpool.opts & PF_POOL_ENDPI)) { + (*udp_mapping)->endpoints[1].port = htons(tmp); + if (pf_udp_mapping_insert(*udp_mapping) == 0) { + *nport = htons(tmp); + return (0); + } + } else { + key.port[1] = htons(tmp); + if (!pf_find_state_all_exists(&key, PF_IN)) { + *nport = htons(tmp); + return (0); + } } } } @@ -326,6 +387,10 @@ return (1); } } while (! PF_AEQ(&init_addr, naddr, af) ); + +failed: + uma_zfree(V_pf_udp_mapping_z, *udp_mapping); + *udp_mapping = NULL; return (1); /* none available */ } @@ -333,7 +398,7 @@ pf_get_mape_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r, struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, uint16_t dport, struct pf_addr *naddr, uint16_t *nport, - struct pf_ksrc_node **sn) + struct pf_ksrc_node **sn, struct pf_udp_mapping **udp_mapping) { uint16_t psmask, low, highmask; uint16_t i, ahigh, cut; @@ -353,13 +418,13 @@ for (i = cut; i <= ahigh; i++) { low = (i << ashift) | psmask; if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport, - naddr, nport, low, low | highmask, sn)) + naddr, nport, low, low | highmask, sn, udp_mapping)) return (0); } for (i = cut - 1; i > 0; i--) { low = (i << ashift) | psmask; if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport, - naddr, nport, low, low | highmask, sn)) + naddr, nport, low, low | highmask, sn, udp_mapping)) return (0); } return (1); @@ -597,7 +662,8 @@ struct pf_state_key **skp, struct pf_state_key **nkp, struct pf_addr *saddr, struct pf_addr *daddr, uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack, - struct pf_krule **rp) + struct pf_krule **rp, + struct pf_udp_mapping **udp_mapping) { struct pf_krule *r = NULL; struct pf_addr *naddr; @@ -661,7 +727,7 @@ } if (r->rpool.mape.offset > 0) { if (pf_get_mape_sport(pd->af, pd->proto, r, saddr, - sport, daddr, dport, naddr, nportp, sn)) { + sport, daddr, dport, naddr, nportp, sn, udp_mapping)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: MAP-E port allocation (%u/%u/%u)" " failed\n", @@ -672,7 +738,7 @@ goto notrans; } } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, - daddr, dport, naddr, nportp, low, high, sn)) { + daddr, dport, naddr, nportp, low, high, sn, udp_mapping)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); diff --git a/tests/sys/netpfil/pf/nat.sh b/tests/sys/netpfil/pf/nat.sh --- a/tests/sys/netpfil/pf/nat.sh +++ b/tests/sys/netpfil/pf/nat.sh @@ -112,6 +112,139 @@ } +atf_test_case "endpoint_independent" "cleanup" +endpoint_independent_head() +{ + atf_set descr 'Test that a client behind NAT gets the same external IP:port for different servers' + atf_set require.user root +} + +endpoint_independent_body() +{ + pft_init + filter="udp and dst port 1234" # only capture udp pings + + epair_client=$(vnet_mkepair) + epair_nat=$(vnet_mkepair) + epair_server1=$(vnet_mkepair) + epair_server2=$(vnet_mkepair) + bridge=$(vnet_mkbridge) + + vnet_mkjail nat ${epair_client}b ${epair_nat}a + vnet_mkjail client ${epair_client}a + vnet_mkjail server1 ${epair_server1}a + vnet_mkjail server2 ${epair_server2}a + + ifconfig ${epair_server1}b up + ifconfig ${epair_server2}b up + ifconfig ${epair_nat}b up + ifconfig ${bridge} \ + addm ${epair_server1}b \ + addm ${epair_server2}b \ + addm ${epair_nat}b \ + up + + jexec nat ifconfig ${epair_client}b 192.0.2.1/24 up + jexec nat ifconfig ${epair_nat}a 198.51.100.42/24 up + jexec nat sysctl net.inet.ip.forwarding=1 + + jexec client ifconfig ${epair_client}a 192.0.2.2/24 up + jexec client route add default 192.0.2.1 + + jexec server1 ifconfig ${epair_server1}a 198.51.100.32/24 up + jexec server2 ifconfig ${epair_server2}a 198.51.100.22/24 up + + # Enable pf! + jexec nat pfctl -e + + # validate non-endpoint independent nat rule behaviour + pft_set_rules nat \ + "nat on ${epair_nat}a inet from ! (${epair_nat}a) to any -> (${epair_nat}a)" + + jexec server1 tcpdump -i ${epair_server1}a -w ${PWD}/server1.pcap \ + --immediate-mode $filter & + server1tcppid="$!" + jexec server2 tcpdump -i ${epair_server2}a -w ${PWD}/server2.pcap \ + --immediate-mode $filter & + server2tcppid="$!" + + # send out multiple packets + for i in $(seq 1 10); do + echo "ping" | jexec client nc -u 198.51.100.32 1234 -p 4242 -w 0 + echo "ping" | jexec client nc -u 198.51.100.22 1234 -p 4242 -w 0 + done + + kill $server1tcppid + kill $server2tcppid + + tuple_server1=$(tcpdump -r ${PWD}/server1.pcap | awk '{addr=$3} END {print addr}') + tuple_server2=$(tcpdump -r ${PWD}/server2.pcap | awk '{addr=$3} END {print addr}') + + if [ -z $tuple_server1 ] + then + atf_fail "server1 did not receive connection from client (default)" + fi + + if [ -z $tuple_server2 ] + then + atf_fail "server2 did not receive connection from client (default)" + fi + + if [ "$tuple_server1" = "$tuple_server2" ] + then + echo "server1 tcpdump: $tuple_server1" + echo "server2 tcpdump: $tuple_server2" + atf_fail "Received same IP:port on server1 and server2 (default)" + fi + + # validate endpoint independent nat rule behaviour + pft_set_rules nat \ + "nat on ${epair_nat}a inet from ! (${epair_nat}a) to any -> (${epair_nat}a) endpoint-independent" + + jexec server1 tcpdump -i ${epair_server1}a -w ${PWD}/server1.pcap \ + --immediate-mode $filter & + server1tcppid="$!" + jexec server2 tcpdump -i ${epair_server2}a -w ${PWD}/server2.pcap \ + --immediate-mode $filter & + server2tcppid="$!" + + # send out multiple packets, sometimes one fails to go through + for i in $(seq 1 10); do + echo "ping" | jexec client nc -u 198.51.100.32 1234 -p 4242 -w 0 + echo "ping" | jexec client nc -u 198.51.100.22 1234 -p 4242 -w 0 + done + + kill $server1tcppid + kill $server2tcppid + + tuple_server1=$(tcpdump -r ${PWD}/server1.pcap | awk '{addr=$3} END {print addr}') + tuple_server2=$(tcpdump -r ${PWD}/server2.pcap | awk '{addr=$3} END {print addr}') + + if [ -z $tuple_server1 ] + then + atf_fail "server1 did not receive connection from client (endpoint-independent)" + fi + + if [ -z $tuple_server2 ] + then + atf_fail "server2 did not receive connection from client (endpoint-independent)" + fi + + if [ ! "$tuple_server1" = "$tuple_server2" ] + then + echo "server1 tcpdump: $tuple_server1" + echo "server2 tcpdump: $tuple_server2" + atf_fail "Received different IP:port on server1 than server2 (endpoint-independent)" + fi +} + +endpoint_independent_cleanup() +{ + pft_cleanup + rm -f server1.out + rm -f server2.out +} + nested_anchor_cleanup() { pft_cleanup @@ -121,4 +254,5 @@ { atf_add_test_case "exhaust" atf_add_test_case "nested_anchor" + atf_add_test_case "endpoint_independent" }