diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -884,6 +884,8 @@ SLIST_HEAD(pf_krule_slist, pf_krule_item); +enum pf_sn_types { PF_SN_LIMIT, PF_SN_NAT, PF_SN_ROUTE, PF_SN_MAX }; + struct pf_ksrc_node { LIST_ENTRY(pf_ksrc_node) entry; struct pf_addr addr; @@ -900,6 +902,7 @@ u_int32_t expire; sa_family_t af; u_int8_t ruletype; + enum pf_sn_types type; struct mtx *lock; }; #endif @@ -1098,8 +1101,8 @@ struct pfi_kkif *kif; struct pfi_kkif *orig_kif; /* The real kif, even if we're a floating state (i.e. if == V_pfi_all). */ struct pfi_kkif *rt_kif; - struct pf_ksrc_node *src_node; - struct pf_ksrc_node *nat_src_node; + struct pf_ksrc_node *sns[PF_SN_MAX];/* source nodes */ + struct pf_srchash *sh; /* source nodes hash row */ u_int64_t packets[2]; u_int64_t bytes[2]; u_int64_t creation; @@ -1113,9 +1116,10 @@ }; /* - * Size <= fits 11 objects per page on LP64. Try to not grow the struct beyond that. +* 6 cache lines per struct, 11 structs per page. +* Try to not grow the struct beyond that. */ -_Static_assert(sizeof(struct pf_kstate) <= 372, "pf_kstate size crosses 372 bytes"); +_Static_assert(sizeof(struct pf_kstate) <= 384, "pf_kstate size crosses 384 bytes"); #endif /* @@ -1584,6 +1588,13 @@ struct pf_sctp_multihome_job; TAILQ_HEAD(pf_sctp_multihome_jobs, pf_sctp_multihome_job); +/* Variables available only during ruleset evaluation */ +struct pf_test_ctx { + struct pf_ksrc_node *sns[PF_SN_MAX];/* source nodes */ + struct pf_srchash *sh; /* source nodes hash row */ +}; + +/* Variables available during packet forwarding */ struct pf_pdesc { struct { int done; @@ -1602,13 +1613,13 @@ char any[0]; } hdr; - struct pf_krule *nat_rule; /* nat/rdr rule applied to packet */ - struct pf_addr *src; /* src address */ - struct pf_addr *dst; /* dst address */ - u_int16_t *sport; - u_int16_t *dport; - struct pf_mtag *pf_mtag; - struct pf_rule_actions act; + struct pf_krule *nat_rule; /* nat/rdr rule applied to packet */ + struct pf_addr *src; /* src address */ + struct pf_addr *dst; /* dst address */ + u_int16_t *sport; + u_int16_t *dport; + struct pf_mtag *pf_mtag; + struct pf_rule_actions act; u_int32_t p_len; /* total length of payload */ u_int32_t badopts; /* v4 options or v6 routing headers */ @@ -2333,9 +2344,12 @@ *mapping); extern void pf_udp_mapping_release(struct pf_udp_mapping *mapping); -extern struct pf_ksrc_node *pf_find_src_node(struct pf_addr *, - struct pf_krule *, sa_family_t, - struct pf_srchash **, bool); +u_short pf_find_src_node(struct pf_test_ctx *, + struct pf_pdesc *, struct pf_addr *, + struct pf_krule *, enum pf_sn_types); +u_short pf_insert_src_node(struct pf_test_ctx *, + struct pf_pdesc *pd, struct pf_addr *, + struct pf_krule *, enum pf_sn_types); extern void pf_unlink_src_node(struct pf_ksrc_node *); extern u_int pf_free_src_nodes(struct pf_ksrc_node_list *); extern void pf_print_state(struct pf_kstate *); @@ -2619,14 +2633,15 @@ struct pf_keth_rule **, struct pf_keth_rule **, int *); -u_short pf_map_addr(u_int8_t, struct pf_krule *, +u_short pf_map_addr(struct pf_pdesc *, struct pf_krule *, struct pf_addr *, struct pf_addr *, struct pfi_kkif **nkif, struct pf_addr *); -u_short pf_map_addr_sn(u_int8_t, struct pf_krule *, - struct pf_addr *, struct pf_addr *, - struct pfi_kkif **nkif, struct pf_addr *, - struct pf_ksrc_node **); -u_short pf_get_translation(struct pf_pdesc *, struct mbuf *, +u_short pf_map_addr_sn(struct pf_test_ctx *, struct pf_pdesc *, + struct pf_krule *, struct pf_addr *, + struct pf_addr *, struct pfi_kkif 
**nkif, + struct pf_addr *, enum pf_sn_types); +u_short pf_get_translation(struct pf_test_ctx *, + struct pf_pdesc *, struct mbuf *, int, struct pfi_kkif *, struct pf_ksrc_node **, struct pf_state_key **, struct pf_state_key **, struct pf_addr *, struct pf_addr *, diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -323,13 +323,14 @@ struct pfi_kkif *, struct mbuf *, int, struct pf_pdesc *, struct pf_krule **, struct pf_kruleset **, struct inpcb *, int); -static int pf_create_state(struct pf_krule *, struct pf_krule *, +static int pf_create_state(struct pf_test_ctx *, + struct pf_krule *, struct pf_krule *, struct pf_krule *, struct pf_pdesc *, - struct pf_ksrc_node *, struct pf_state_key *, - struct pf_state_key *, struct mbuf *, int, - u_int16_t, u_int16_t, int *, struct pfi_kkif *, - struct pf_kstate **, int, u_int16_t, u_int16_t, - int, struct pf_krule_slist *, struct pf_udp_mapping *); + struct pf_state_key *, struct pf_state_key *, + struct mbuf *, int, u_int16_t, u_int16_t, int *, + struct pfi_kkif *, struct pf_kstate **, int, + u_int16_t, u_int16_t, int, struct pf_krule_slist *, + struct pf_udp_mapping *); static int pf_state_key_addr_setup(struct pf_pdesc *, struct mbuf *, int, struct pf_state_key_cmp *, int, struct pf_addr *, int, struct pf_addr *, int); @@ -369,15 +370,15 @@ bool, u_int8_t); static struct pf_kstate *pf_find_state(struct pfi_kkif *, const struct pf_state_key_cmp *, u_int); -static int pf_src_connlimit(struct pf_kstate **); +static bool pf_find_src_node_ptr(struct pf_srchash *, + struct pf_ksrc_node *); +static int pf_src_connlimit(struct pf_kstate *); static int pf_match_rcvif(struct mbuf *, struct pf_krule *); static void pf_counters_inc(int, struct pf_pdesc *, struct pfi_kkif *, struct pf_kstate *, struct pf_krule *, struct pf_krule *); static void pf_overload_task(void *v, int pending); -static u_short pf_insert_src_node(struct pf_ksrc_node **, - struct pf_krule *, struct pf_addr *, sa_family_t); static u_int pf_purge_expired_states(u_int, int); static void pf_purge_unlinked_rules(void); static int pf_mtag_uminit(void *, int, int); @@ -813,59 +814,71 @@ } static int -pf_src_connlimit(struct pf_kstate **state) +pf_src_connlimit(struct pf_kstate *state) { struct pf_overload_entry *pfoe; + struct pf_ksrc_node *sn = state->sns[PF_SN_LIMIT]; int bad = 0; + int ret = 1; - PF_STATE_LOCK_ASSERT(*state); - /* - * XXXKS: The src node is accessed unlocked! - * PF_SRC_NODE_LOCK_ASSERT((*state)->src_node); - */ + PF_STATE_LOCK_ASSERT(state); + KASSERT(sn != NULL, + ("pf_src_connlimit: state->sns[PF_SN_LIMIT] == NULL")); - (*state)->src_node->conn++; - (*state)->src.tcp_est = 1; - pf_add_threshold(&(*state)->src_node->conn_rate); + if (!pf_find_src_node_ptr(state->sh, sn)) { + ret = 0; + goto done; + } - if ((*state)->rule.ptr->max_src_conn && - (*state)->rule.ptr->max_src_conn < - (*state)->src_node->conn) { + sn->conn++; + state->src.tcp_est = 1; + pf_add_threshold(&(sn->conn_rate)); + + if (state->rule.ptr->max_src_conn && + state->rule.ptr->max_src_conn < sn->conn) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1); bad++; } - if ((*state)->rule.ptr->max_src_conn_rate.limit && - pf_check_threshold(&(*state)->src_node->conn_rate)) { + if (state->rule.ptr->max_src_conn_rate.limit && + pf_check_threshold(&(sn->conn_rate))) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1); bad++; } - if (!bad) - return (0); + if (!bad) { + ret = 0; + goto done; + } /* Kill this state. 
*/ - (*state)->timeout = PFTM_PURGE; - pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED); + state->timeout = PFTM_PURGE; + pf_set_protostate(state, PF_PEER_BOTH, TCPS_CLOSED); - if ((*state)->rule.ptr->overload_tbl == NULL) - return (1); + if (state->rule.ptr->overload_tbl == NULL) { + ret = 1; + goto done; + } /* Schedule overloading and flushing task. */ pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT); - if (pfoe == NULL) - return (1); /* too bad :( */ + if (pfoe == NULL) { + ret = 1; /* too bad :( */ + goto done; + } - bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr)); - pfoe->af = (*state)->key[PF_SK_WIRE]->af; - pfoe->rule = (*state)->rule.ptr; - pfoe->dir = (*state)->direction; + bcopy(&sn->addr, &pfoe->addr, sizeof(pfoe->addr)); + pfoe->af = state->key[PF_SK_WIRE]->af; + pfoe->rule = state->rule.ptr; + pfoe->dir = state->direction; PF_OVERLOADQ_LOCK(); SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next); PF_OVERLOADQ_UNLOCK(); taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask); - return (1); +done: + PF_HASHROW_UNLOCK(state->sh); + return ret; } static void @@ -961,33 +974,61 @@ CURVNET_RESTORE(); } -/* - * Can return locked on failure, so that we can consistently - * allocate and insert a new one. - */ -struct pf_ksrc_node * -pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af, - struct pf_srchash **sh, bool returnlocked) +u_short +pf_find_src_node(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct pf_addr *src, struct pf_krule *rule, enum pf_sn_types sn_type) { - struct pf_ksrc_node *n; + struct pf_ksrc_node *cur; counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); - *sh = &V_pf_srchash[pf_hashsrc(src, af)]; - PF_HASHROW_LOCK(*sh); - LIST_FOREACH(n, &(*sh)->nodes, entry) - if (n->rule.ptr == rule && n->af == af && - ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || - (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) - break; + /* First search of src node sets ctx->sh and locks it */ + if (ctx->sh == NULL) { + ctx->sh = &V_pf_srchash[pf_hashsrc(src, pd->af)]; + PF_HASHROW_LOCK(ctx->sh); + } else { + PF_HASHROW_ASSERT(ctx->sh); +#ifdef INVARIANTS + if (ctx->sh != &V_pf_srchash[pf_hashsrc(src, pd->af)]) + panic("%s: source node hash mismatch", __func__); +#endif + } - if (n != NULL) { - n->states++; - PF_HASHROW_UNLOCK(*sh); - } else if (returnlocked == false) - PF_HASHROW_UNLOCK(*sh); + LIST_FOREACH(cur, &(ctx->sh->nodes), entry) { + if (cur->rule.ptr == rule && + cur->af == pd->af && + cur->type == sn_type && + ((pd->af == AF_INET && + cur->addr.v4.s_addr == pd->src->v4.s_addr) || + (pd->af == AF_INET6 && bcmp(&(cur->addr), pd->src, + sizeof(*(pd->src))) == 0)) && + cur->expire != 1 + ) { + cur->expire = (rule->timeout[PFTM_SRC_NODE] ? 
+ rule->timeout[PFTM_SRC_NODE] : + V_pf_default_rule.timeout[PFTM_SRC_NODE]) + time_uptime; + ctx->sns[sn_type] = cur; + return (1); + } + } - return (n); + return (0); +} + +static bool +pf_find_src_node_ptr(struct pf_srchash *sh, struct pf_ksrc_node *sn) +{ + struct pf_ksrc_node *cur; + + KASSERT(sh != NULL, ("%s: sh is NULL", __func__)); + + counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); + PF_HASHROW_LOCK(sh); + LIST_FOREACH(cur, &(sh->nodes), entry) { + if (cur == sn && cur->expire != 1) + return true; + } + return false; } static void @@ -1001,81 +1042,122 @@ uma_zfree(V_pf_sources_z, sn); } -static u_short -pf_insert_src_node(struct pf_ksrc_node **sn, struct pf_krule *rule, - struct pf_addr *src, sa_family_t af) +u_short +pf_insert_src_node(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct pf_addr *src, struct pf_krule *rule, enum pf_sn_types sn_type) { - u_short reason = 0; - struct pf_srchash *sh = NULL; + u_short reason = 0; - KASSERT((rule->rule_flag & PFRULE_SRCTRACK || - rule->rpool.opts & PF_POOL_STICKYADDR), + KASSERT((rule->nr == -1 || rule->rule_flag & PFRULE_SRCTRACK || + rule->rpool.opts & PF_POOL_STICKYADDR), ("%s for non-tracking rule %p", __func__, rule)); - if (*sn == NULL) - *sn = pf_find_src_node(src, rule, af, &sh, true); + /* + * This function must be used after pf_find_src_node() + * which will set ctx->sh. + */ + PF_HASHROW_ASSERT(ctx->sh); - if (*sn == NULL) { - PF_HASHROW_ASSERT(sh); + KASSERT(ctx->sns[sn_type] == NULL, ("%s: ctx->sns[%d] not NULL", + __func__, sn_type)); - if (rule->max_src_nodes && - counter_u64_fetch(rule->src_nodes) >= rule->max_src_nodes) { - counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1); - PF_HASHROW_UNLOCK(sh); - reason = PFRES_SRCLIMIT; - goto done; - } + ctx->sns[sn_type] = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); + if (ctx->sns[sn_type] == NULL) { + reason = PFRES_MEMORY; + goto done; + } - (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); - if ((*sn) == NULL) { - PF_HASHROW_UNLOCK(sh); + for (int i = 0; i < 2; i++) { + ctx->sns[sn_type]->bytes[i] = counter_u64_alloc(M_NOWAIT); + ctx->sns[sn_type]->packets[i] = counter_u64_alloc(M_NOWAIT); + + if (ctx->sns[sn_type]->bytes[i] == NULL || + ctx->sns[sn_type]->packets[i] == NULL) { + pf_free_src_node(ctx->sns[sn_type]); reason = PFRES_MEMORY; + ctx->sns[sn_type] = NULL; goto done; } + } - for (int i = 0; i < 2; i++) { - (*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT); - (*sn)->packets[i] = counter_u64_alloc(M_NOWAIT); + pf_init_threshold(&(ctx->sns[sn_type])->conn_rate, + rule->max_src_conn_rate.limit, + rule->max_src_conn_rate.seconds); - if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) { - pf_free_src_node(*sn); - PF_HASHROW_UNLOCK(sh); - reason = PFRES_MEMORY; - goto done; - } - } + MPASS(ctx->sns[sn_type]->lock == NULL); + ctx->sns[sn_type]->lock = &(ctx->sh->lock); - pf_init_threshold(&(*sn)->conn_rate, - rule->max_src_conn_rate.limit, - rule->max_src_conn_rate.seconds); + ctx->sns[sn_type]->af = pd->af; + ctx->sns[sn_type]->rule.ptr = rule; + PF_ACPY(&(ctx->sns[sn_type]->addr), src, pd->af); + LIST_INSERT_HEAD(&(ctx->sh->nodes), ctx->sns[sn_type], entry); + ctx->sns[sn_type]->creation = time_uptime; + ctx->sns[sn_type]->ruletype = rule->action; + ctx->sns[sn_type]->type = sn_type; + ctx->sns[sn_type]->expire = (rule->timeout[PFTM_SRC_NODE] ? 
+ rule->timeout[PFTM_SRC_NODE] : + V_pf_default_rule.timeout[PFTM_SRC_NODE]) + time_uptime; - MPASS((*sn)->lock == NULL); - (*sn)->lock = &sh->lock; + counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); - (*sn)->af = af; - (*sn)->rule.ptr = rule; - PF_ACPY(&(*sn)->addr, src, af); - LIST_INSERT_HEAD(&sh->nodes, *sn, entry); - (*sn)->creation = time_uptime; - (*sn)->ruletype = rule->action; - (*sn)->states = 1; - if ((*sn)->rule.ptr != NULL) - counter_u64_add((*sn)->rule.ptr->src_nodes, 1); - PF_HASHROW_UNLOCK(sh); - counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); - } else { - if (rule->max_src_states && - (*sn)->states >= rule->max_src_states) { - counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], +done: + /* Returns locked */ + return (reason); +} + +static u_short +pf_check_source_limits(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct pf_krule *rule) +{ + struct pf_krule *r_track = rule; + u_short reason = 0; + + /* + * Only src tracking uses this function! Load balancing code uses + * pf_map_addr_sn() which calls the low level functions directly. + */ + KASSERT(rule->rule_flag & PFRULE_SRCTRACK, + ("%s for non-tracking rule %p", __func__, rule)); + + /* + * Rules with global source tracking store the counters of connected + * sources and their states in the default rule. + */ + if (!(rule->rule_flag & PFRULE_RULESRCTRACK)) + r_track = &V_pf_default_rule; + + pf_find_src_node(ctx, pd, pd->src, r_track, PF_SN_LIMIT); + + PF_HASHROW_ASSERT(ctx->sh); /* set by pf_find_src_node() */ + + if (ctx->sns[PF_SN_LIMIT] == NULL) { + if (rule->max_src_nodes && + counter_u64_fetch(r_track->src_nodes) >= rule->max_src_nodes) { + counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1); reason = PFRES_SRCLIMIT; goto done; } + if ((reason = pf_insert_src_node(ctx, pd, pd->src, r_track, PF_SN_LIMIT)) != 0) + goto done; + if (ctx->sns[PF_SN_LIMIT]->rule.ptr != NULL) + counter_u64_add(ctx->sns[PF_SN_LIMIT]->rule.ptr->src_nodes, 1); + } else { + if (rule->max_src_states && + ctx->sns[PF_SN_LIMIT]->states >= rule->max_src_states) { + counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], + 1); + reason = PFRES_MAXSTATES; + goto done; + } } + done: + PF_HASHROW_UNLOCK(ctx->sh); return (reason); } + void pf_unlink_src_node(struct pf_ksrc_node *src) { @@ -2533,32 +2615,33 @@ static void pf_src_tree_remove_state(struct pf_kstate *s) { - struct pf_ksrc_node *sn; - uint32_t timeout; + enum pf_sn_types sn_type; + uint32_t timeout; + + PF_STATE_LOCK_ASSERT(s); timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ? s->rule.ptr->timeout[PFTM_SRC_NODE] : V_pf_default_rule.timeout[PFTM_SRC_NODE]; - if (s->src_node != NULL) { - sn = s->src_node; - PF_SRC_NODE_LOCK(sn); + for (sn_type=0; sn_typesns[sn_type] == NULL || + !pf_find_src_node_ptr(s->sh, s->sns[sn_type]) + ) + continue; + if (s->src.tcp_est) - --sn->conn; - if (--sn->states == 0) - sn->expire = time_uptime + timeout; - PF_SRC_NODE_UNLOCK(sn); + --(s->sns[sn_type]->conn); + /* Last removed state sets the expiry of the src node */ + if (--(s->sns[sn_type]->states) == 0) + s->sns[sn_type]->expire = time_uptime + timeout; + PF_SRC_NODE_UNLOCK(s->sns[sn_type]); + s->sns[sn_type] = NULL; } - if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { - sn = s->nat_src_node; - PF_SRC_NODE_LOCK(sn); - if (--sn->states == 0) - sn->expire = time_uptime + timeout; - PF_SRC_NODE_UNLOCK(sn); - } - s->src_node = s->nat_src_node = NULL; + } + /* * Unlink and potentilly free a state. 
Function may be * called with ID hash row locked, but always returns @@ -4887,6 +4970,7 @@ struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_krule **am, struct pf_kruleset **rsm, struct inpcb *inp, int hdrlen) { + struct pf_test_ctx ctx; struct pf_krule *nr = NULL; struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; @@ -4898,7 +4982,7 @@ struct pf_ksrc_node *nsn = NULL; struct tcphdr *th = &pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; - u_short reason, transerror; + u_short reason, transerror, sn_reason = 0; int rewrite = 0; int tag = -1; int asd = 0; @@ -4912,6 +4996,7 @@ PF_RULES_RASSERT(); + memset(&ctx, 0, sizeof(ctx)); SLIST_INIT(&match_rules); if (inp != NULL) { @@ -4975,7 +5060,7 @@ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ - transerror = pf_get_translation(pd, m, off, kif, &nsn, &sk, + transerror = pf_get_translation(&ctx, pd, m, off, kif, &nsn, &sk, &nk, saddr, daddr, sport, dport, anchor_stack, &nr, &udp_mapping); switch (transerror) { default: @@ -5308,7 +5393,27 @@ (!state_icmp && (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM)))) { int action; - action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, + + /* + * If this rule has source tracking, find or create a source + * node of PF_SN_LIMIT type. This source node will be used for + * providing connection limit per source. + */ + if (r->rule_flag & PFRULE_SRCTRACK) { + if ((sn_reason = pf_check_source_limits(&ctx, pd, r)) != 0) { + REASON_SET(&reason, sn_reason); + goto cleanup; + } + } + + if (r->max_states && + (counter_u64_fetch(r->states_cur) > r->max_states)) { + counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); + REASON_SET(&reason, PFRES_MAXSTATES); + goto cleanup; + } + + action = pf_create_state(&ctx, r, nr, a, pd, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, hdrlen, &match_rules, udp_mapping); if (action != PF_PASS) { @@ -5362,42 +5467,22 @@ } static int -pf_create_state(struct pf_krule *r, struct pf_krule *nr, struct pf_krule *a, - struct pf_pdesc *pd, struct pf_ksrc_node *nsn, struct pf_state_key *nk, - struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, - u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm, - int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, - struct pf_krule_slist *match_rules, struct pf_udp_mapping *udp_mapping) +pf_create_state(struct pf_test_ctx *ctx, struct pf_krule *r, + struct pf_krule *nr, struct pf_krule *a, + struct pf_pdesc *pd, struct pf_state_key *nk, struct pf_state_key *sk, + struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, + struct pfi_kkif *kif, struct pf_kstate **sm, int tag, u_int16_t bproto_sum, + u_int16_t bip_sum, int hdrlen, struct pf_krule_slist *match_rules, + struct pf_udp_mapping *udp_mapping) { struct pf_kstate *s = NULL; - struct pf_ksrc_node *sn = NULL; struct tcphdr *th = &pd->hdr.tcp; u_int16_t mss = V_tcp_mssdflt; - u_short reason, sn_reason; + u_short reason; struct pf_krule_item *ri; + enum pf_sn_types sn_type; - /* check maximums */ - if (r->max_states && - (counter_u64_fetch(r->states_cur) >= r->max_states)) { - counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); - REASON_SET(&reason, PFRES_MAXSTATES); - goto csfailed; - } - /* src node for filter rule */ - if ((r->rule_flag & PFRULE_SRCTRACK || - r->rpool.opts & PF_POOL_STICKYADDR) && - (sn_reason = pf_insert_src_node(&sn, r, pd->src, pd->af)) != 0) { - 
REASON_SET(&reason, sn_reason); - goto csfailed; - } - /* src node for translation rule */ - if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && - (sn_reason = pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], - pd->af)) != 0 ) { - REASON_SET(&reason, sn_reason); - goto csfailed; - } - s = pf_alloc_state(M_NOWAIT); + s = pf_alloc_state(M_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto csfailed; @@ -5486,21 +5571,46 @@ if (r->rt) { /* pf_map_addr increases the reason counters */ - if ((reason = pf_map_addr_sn(pd->af, r, pd->src, &s->rt_addr, - &s->rt_kif, NULL, &sn)) != 0) + if ((reason = pf_map_addr_sn(ctx, pd, r, pd->src, &s->rt_addr, + &s->rt_kif, NULL, PF_SN_ROUTE)) != 0) goto csfailed; s->rt = r->rt; } s->creation = s->expire = pf_get_uptime(); - if (sn != NULL) - s->src_node = sn; - if (nsn != NULL) { - /* XXX We only modify one side for now. */ - PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); - s->nat_src_node = nsn; + /* + * Source nodes might have been inserted or found by: + * - NAT rules: pf_test() -> pf_get_translation() -> pf_map_addr_sn() + * - route-to rules: pf_map_addr_sn() just above + * - source limit tracking: pf_check_source_limits() before pf_create_state() + * + * Each of those operations sets ctx->sns[sn_type], so that we can use + * them now. However, those operations unlock the hash row ctx->sh. + * We must check if maybe source nodes have been removed in the meantime. + */ + s->sh = ctx->sh; + for (sn_type=0; sn_typesns[sn_type] == NULL || + !pf_find_src_node_ptr(ctx->sh, ctx->sns[sn_type]) + ) + continue; + + if (sn_type == PF_SN_NAT) { + /* XXX We only modify one side for now. */ + PF_ACPY(&(ctx->sns[sn_type]->raddr), + &nk->addr[1], pd->af); + } + + ctx->sns[sn_type]->states++; + s->sns[sn_type] = ctx->sns[sn_type]; + PF_HASHROW_UNLOCK(ctx->sh); } + if (pd->proto == IPPROTO_TCP) { if (s->state_flags & PFSTATE_SCRUB_TCP && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { @@ -5597,27 +5707,7 @@ uma_zfree(V_pf_state_key_z, sk); uma_zfree(V_pf_state_key_z, nk); - if (sn != NULL) { - PF_SRC_NODE_LOCK(sn); - if (--sn->states == 0 && sn->expire == 0) { - pf_unlink_src_node(sn); - uma_zfree(V_pf_sources_z, sn); - counter_u64_add( - V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); - } - PF_SRC_NODE_UNLOCK(sn); - } - - if (nsn != sn && nsn != NULL) { - PF_SRC_NODE_LOCK(nsn); - if (--nsn->states == 0 && nsn->expire == 0) { - pf_unlink_src_node(nsn); - uma_zfree(V_pf_sources_z, nsn); - counter_u64_add( - V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); - } - PF_SRC_NODE_UNLOCK(nsn); - } + /* XXX KS: re-introduce src node removal */ drop: if (s != NULL) { @@ -5831,8 +5921,8 @@ pf_set_protostate(*state, pdst, TCPS_ESTABLISHED); if (src->state == TCPS_ESTABLISHED && - (*state)->src_node != NULL && - pf_src_connlimit(state)) { + (*state)->sns[PF_SN_LIMIT] != NULL && + pf_src_connlimit(*state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } @@ -6002,8 +6092,8 @@ if (dst->state == TCPS_SYN_SENT) { pf_set_protostate(*state, pdst, TCPS_ESTABLISHED); if (src->state == TCPS_ESTABLISHED && - (*state)->src_node != NULL && - pf_src_connlimit(state)) { + (*state)->sns[PF_SN_LIMIT] && + pf_src_connlimit(*state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } @@ -6020,8 +6110,8 @@ pf_set_protostate(*state, PF_PEER_BOTH, TCPS_ESTABLISHED); dst->state = src->state = TCPS_ESTABLISHED; - if ((*state)->src_node != NULL && - pf_src_connlimit(state)) { + if ((*state)->sns[PF_SN_LIMIT] != NULL && + pf_src_connlimit(*state)) { 
 			REASON_SET(reason, PFRES_SRCLIMIT);
 			return (PF_DROP);
 		}
@@ -6087,8 +6177,8 @@
 		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_DROP);
-		} else if ((*state)->src_node != NULL &&
-		    pf_src_connlimit(state)) {
+		} else if ((*state)->sns[PF_SN_LIMIT] != NULL &&
+		    pf_src_connlimit(*state)) {
 			REASON_SET(reason, PFRES_SRCLIMIT);
 			return (PF_DROP);
 		} else
@@ -7790,7 +7880,7 @@
 			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 			goto bad_locked;
 		}
-		pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
+		pf_map_addr(pd, r, (struct pf_addr *)&ip->ip_src,
 		    &naddr, &nkif, NULL);
 		if (!PF_AZERO(&naddr, AF_INET))
 			dst.sin_addr.s_addr = naddr.v4.s_addr;
@@ -8038,7 +8128,7 @@
 			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 			goto bad_locked;
 		}
-		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
+		pf_map_addr(pd, r, (struct pf_addr *)&ip6->ip6_src,
 		    &naddr, &nkif, NULL);
 		if (!PF_AZERO(&naddr, AF_INET6))
 			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
@@ -8802,17 +8892,21 @@
 			pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx],
 			    pd->tot_len);
 		}
-		if (s->src_node != NULL) {
-			counter_u64_add(s->src_node->packets[dirndx],
-			    1);
-			counter_u64_add(s->src_node->bytes[dirndx],
-			    pd->tot_len);
-		}
-		if (s->nat_src_node != NULL) {
-			counter_u64_add(s->nat_src_node->packets[dirndx],
-			    1);
-			counter_u64_add(s->nat_src_node->bytes[dirndx],
-			    pd->tot_len);
+		/*
+		 * Source nodes are accessed unlocked here.
+		 * But since we are operating with stateful tracking
+		 * and the state is locked, those SNs could not have
+		 * been freed.
+		 */
+		for (enum pf_sn_types sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
+			if (s->sns[sn_type] != NULL) {
+				counter_u64_add(
+				    s->sns[sn_type]->packets[dirndx],
+				    1);
+				counter_u64_add(
+				    s->sns[sn_type]->bytes[dirndx],
+				    pd->tot_len);
+			}
 		}
 		dirndx = (dir == s->direction) ? 0 : 1;
 		s->packets[dirndx]++;
@@ -8924,12 +9018,28 @@
 	switch (af) {
 #ifdef INET
 	case AF_INET:
+		/* Packet normalization, reassembly, legacy scrub ruleset */
+		if (pf_normalize_ip(&m, kif, reason, pd) != PF_PASS) {
+			if (m == NULL) {
+				/* if packet sits in reassembly queue, return without error */
+				return PF_PASS;
+			}
+			return PF_DROP;
+		}
 		h = mtod(m, struct ip *);
 		ttl = h->ip_ttl;
 		break;
#endif
#ifdef INET6
 	case AF_INET6:
+		/* Packet normalization, reassembly, legacy scrub ruleset */
+		if (pf_normalize_ip6(&m, kif, reason, pd) != PF_PASS) {
+			if (m == NULL) {
+				/* if packet sits in reassembly queue, return without error */
+				return PF_PASS;
+			}
+			return PF_DROP;
+		}
 		h6 = mtod(m, struct ip6_hdr *);
 		ttl = h6->ip6_hlim;
 		break;
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -5548,6 +5548,7 @@
 void
 pfsync_state_export(union pfsync_state_union *sp, struct pf_kstate *st, int msg_version)
 {
+	enum pf_sn_types sn_type;
 	bzero(sp, sizeof(union pfsync_state_union));

 	/* copy from state key */
@@ -5603,10 +5604,14 @@
 		    __func__, msg_version);
 	}

-	if (st->src_node)
-		sp->pfs_1301.sync_flags |= PFSYNC_FLAG_SRCNODE;
-	if (st->nat_src_node)
-		sp->pfs_1301.sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+	for (sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
+		if (st->sns[sn_type] == NULL)
+			continue;
+		if (sn_type == PF_SN_LIMIT || sn_type == PF_SN_ROUTE)
+			sp->pfs_1301.sync_flags |= PFSYNC_FLAG_SRCNODE;
+		else
+			sp->pfs_1301.sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+	}

 	sp->pfs_1301.id = st->id;
 	sp->pfs_1301.creatorid = st->creatorid;
@@ -5635,6 +5640,7 @@
 void
 pf_state_export(struct pf_state_export *sp, struct pf_kstate *st)
 {
+	enum pf_sn_types sn_type;
 	bzero(sp, sizeof(*sp));
 	sp->version = PF_STATE_VERSION;
@@ -5669,10 +5675,14 @@
 	/* 8 bits for the old libpfctl, 16 bits for the new libpfctl */
 	sp->state_flags_compat = st->state_flags;
 	sp->state_flags = htons(st->state_flags);
-	if (st->src_node)
-		sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
-	if (st->nat_src_node)
-		sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+	for (sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
+		if (st->sns[sn_type] == NULL)
+			continue;
+		if (sn_type == PF_SN_LIMIT || sn_type == PF_SN_ROUTE)
+			sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
+		else
+			sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+	}

 	sp->id = st->id;
 	sp->creatorid = st->creatorid;
@@ -5909,6 +5919,7 @@
 	struct pf_kstate *s;
 	struct pf_srchash *sh;
 	struct pf_ksrc_node *sn;
+	enum pf_sn_types sn_type;
 	int i;

 	for (i = 0; i <= V_pf_hashmask; i++) {
@@ -5916,8 +5927,9 @@
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
-			s->src_node = NULL;
-			s->nat_src_node = NULL;
+			for (sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
+				s->sns[sn_type] = NULL;
+			}
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
@@ -5937,6 +5949,7 @@
 pf_kill_srcnodes(struct pfioc_src_node_kill *psnk)
 {
 	struct pf_ksrc_node_list	kill;
+	enum pf_sn_types sn_type;

 	LIST_INIT(&kill);
 	for (int i = 0; i <= V_pf_srchashmask; i++) {
@@ -5966,10 +5979,16 @@
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
-			if (s->src_node && s->src_node->expire == 1)
-				s->src_node = NULL;
-			if (s->nat_src_node && s->nat_src_node->expire == 1)
-				s->nat_src_node = NULL;
+			for(sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
+				/*
+				 * If there's 1 in ->expire, the SN
+				 * has been unlinked while it was locked.
+ */ + if (s->sns[sn_type] && + s->sns[sn_type]->expire == 1) { + s->sns[sn_type] = NULL; + } + } } PF_HASHROW_UNLOCK(ih); } diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -67,10 +67,11 @@ int, struct pfi_kkif *, struct pf_addr *, u_int16_t, struct pf_addr *, uint16_t, int, struct pf_kanchor_stackframe *); -static int pf_get_sport(sa_family_t, uint8_t, struct pf_krule *, - struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *, - uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **, - struct pf_udp_mapping **); +static int pf_get_sport(struct pf_test_ctx *, struct pf_pdesc *, + struct pf_krule *, struct pf_addr *, uint16_t, + struct pf_addr *, uint16_t, struct pf_addr *, + uint16_t *, uint16_t, uint16_t, enum pf_sn_types, + struct pf_udp_mapping **); static bool pf_islinklocal(const sa_family_t, const struct pf_addr *); #define mix(a,b,c) \ @@ -222,15 +223,14 @@ } static int -pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r, +pf_get_sport(struct pf_test_ctx *ctx, struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low, - uint16_t high, struct pf_ksrc_node **sn, + uint16_t high, enum pf_sn_types sn_type, struct pf_udp_mapping **udp_mapping) { struct pf_state_key_cmp key; struct pf_addr init_addr; - struct pf_srchash *sh = NULL; bzero(&init_addr, sizeof(init_addr)); @@ -241,33 +241,31 @@ * from the mapping. In this case we have to look up the src_node as * pf_map_addr would. */ - if (proto == IPPROTO_UDP && (r->rpool.opts & PF_POOL_ENDPI)) { + if (pd->proto == IPPROTO_UDP && (r->rpool.opts & PF_POOL_ENDPI)) { struct pf_udp_endpoint_cmp udp_source; bzero(&udp_source, sizeof(udp_source)); - udp_source.af = af; - PF_ACPY(&udp_source.addr, saddr, af); + udp_source.af = pd->af; + PF_ACPY(&udp_source.addr, saddr, pd->af); udp_source.port = sport; *udp_mapping = pf_udp_mapping_find(&udp_source); if (*udp_mapping) { - PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, af); + PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, pd->af); *nport = (*udp_mapping)->endpoints[1].port; - /* Try to find a src_node as per pf_map_addr(). 
*/ - if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && - (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(saddr, r, af, &sh, 0); + pf_map_addr_sn(ctx, pd, r, saddr, naddr, NULL, &init_addr, sn_type); return (0); } else { - *udp_mapping = pf_udp_mapping_create(af, saddr, sport, &init_addr, 0); + *udp_mapping = pf_udp_mapping_create(pd->af, saddr, + sport, &init_addr, 0); if (*udp_mapping == NULL) return (1); } } - if (pf_map_addr_sn(af, r, saddr, naddr, NULL, &init_addr, sn)) + if (pf_map_addr_sn(ctx, pd, r, saddr, naddr, NULL, &init_addr, sn_type)) goto failed; - if (proto == IPPROTO_ICMP) { + if (pd->proto == IPPROTO_ICMP) { if (*nport == htons(ICMP_ECHO)) { low = 1; high = 65535; @@ -275,7 +273,7 @@ return (0); /* Don't try to modify non-echo ICMP */ } #ifdef INET6 - if (proto == IPPROTO_ICMPV6) { + if (pd->proto == IPPROTO_ICMPV6) { if (*nport == htons(ICMP6_ECHO_REQUEST)) { low = 1; high = 65535; @@ -285,21 +283,21 @@ #endif /* INET6 */ bzero(&key, sizeof(key)); - key.af = af; - key.proto = proto; + key.af = pd->af; + key.proto = pd->proto; key.port[0] = dport; PF_ACPY(&key.addr[0], daddr, key.af); do { PF_ACPY(&key.addr[1], naddr, key.af); if (*udp_mapping) - PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, af); + PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, pd->af); /* * port search; start random, step; * similar 2 portloop in in_pcbbind */ - if (proto == IPPROTO_SCTP) { + if (pd->proto == IPPROTO_SCTP) { key.port[1] = sport; if (!pf_find_state_all_exists(&key, PF_IN)) { *nport = sport; @@ -307,8 +305,8 @@ } else { return (1); /* Fail mapping. */ } - } else if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || - proto == IPPROTO_ICMP) || (low == 0 && high == 0)) { + } else if (!(pd->proto == IPPROTO_TCP || pd->proto == IPPROTO_UDP || + pd->proto == IPPROTO_ICMP) || (low == 0 && high == 0)) { /* * XXX bug: icmp states don't use the id on both sides. * (traceroute -I through nat) @@ -361,7 +359,7 @@ } tmp = cut; for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) { - if (proto == IPPROTO_UDP && + if (pd->proto == IPPROTO_UDP && (r->rpool.opts & PF_POOL_ENDPI)) { (*udp_mapping)->endpoints[1].port = htons(tmp); if (pf_udp_mapping_insert(*udp_mapping) == 0) { @@ -385,7 +383,8 @@ * pick a different source address since we're out * of free port choices for the current one. */ - if (pf_map_addr_sn(af, r, saddr, naddr, NULL, &init_addr, sn)) + if (pf_map_addr_sn(ctx, pd, r, saddr, naddr, NULL, + &init_addr, sn_type)) return (1); break; case PF_POOL_NONE: @@ -394,7 +393,7 @@ default: return (1); } - } while (! PF_AEQ(&init_addr, naddr, af) ); + } while (! 
PF_AEQ(&init_addr, naddr, pd->af) ); failed: uma_zfree(V_pf_udp_mapping_z, *udp_mapping); @@ -411,10 +410,11 @@ } static int -pf_get_mape_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r, - struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, - uint16_t dport, struct pf_addr *naddr, uint16_t *nport, - struct pf_ksrc_node **sn, struct pf_udp_mapping **udp_mapping) +pf_get_mape_sport(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct pf_krule *r, struct pf_addr *saddr, uint16_t sport, + struct pf_addr *daddr, uint16_t dport, struct pf_addr *naddr, + uint16_t *nport, enum pf_sn_types sn_type, + struct pf_udp_mapping **udp_mapping) { uint16_t psmask, low, highmask; uint16_t i, ahigh, cut; @@ -433,21 +433,21 @@ for (i = cut; i <= ahigh; i++) { low = (i << ashift) | psmask; - if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport, - naddr, nport, low, low | highmask, sn, udp_mapping)) + if (!pf_get_sport(ctx, pd, r, saddr, sport, daddr, dport, + naddr, nport, low, low | highmask, sn_type, udp_mapping)) return (0); } for (i = cut - 1; i > 0; i--) { low = (i << ashift) | psmask; - if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport, - naddr, nport, low, low | highmask, sn, udp_mapping)) + if (!pf_get_sport(ctx, pd, r, saddr, sport, daddr, dport, + naddr, nport, low, low | highmask, sn_type, udp_mapping)) return (0); } return (1); } u_short -pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, +pf_map_addr(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *saddr, struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr) { u_short reason = PFRES_MATCH; @@ -462,7 +462,7 @@ goto done_pool_mtx; } if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - switch (af) { + switch (pd->af) { #ifdef INET case AF_INET: if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && @@ -500,14 +500,14 @@ switch (rpool->opts & PF_POOL_TYPEMASK) { case PF_POOL_NONE: - PF_ACPY(naddr, raddr, af); + PF_ACPY(naddr, raddr, pd->af); break; case PF_POOL_BITMASK: - PF_POOLMASK(naddr, raddr, rmask, saddr, af); + PF_POOLMASK(naddr, raddr, rmask, saddr, pd->af); break; case PF_POOL_RANDOM: - if (init_addr != NULL && PF_AZERO(init_addr, af)) { - switch (af) { + if (init_addr != NULL && PF_AZERO(init_addr, pd->af)) { + switch (pd->af) { #ifdef INET case AF_INET: rpool->counter.addr32[0] = htonl(arc4random()); @@ -536,20 +536,20 @@ break; #endif /* INET6 */ } - PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); - PF_ACPY(init_addr, naddr, af); + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, pd->af); + PF_ACPY(init_addr, naddr, pd->af); } else { - PF_AINC(&rpool->counter, af); - PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + PF_AINC(&rpool->counter, pd->af); + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, pd->af); } break; case PF_POOL_SRCHASH: { unsigned char hash[16]; - pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); - PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); + pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, pd->af); + PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, pd->af); break; } case PF_POOL_ROUNDROBIN: @@ -558,13 +558,13 @@ if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (!pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) + &rpool->tblidx, &rpool->counter, pd->af, NULL)) goto get_addr; } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) + &rpool->tblidx, 
&rpool->counter, pd->af, pf_islinklocal)) goto get_addr; - } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) + } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, pd->af)) goto get_addr; try_next: @@ -575,7 +575,7 @@ if (rpool->cur->addr.type == PF_ADDR_TABLE) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + &rpool->tblidx, &rpool->counter, pd->af, NULL)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -585,7 +585,7 @@ } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) { + &rpool->tblidx, &rpool->counter, pd->af, pf_islinklocal)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -595,14 +595,14 @@ } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; - PF_ACPY(&rpool->counter, raddr, af); + PF_ACPY(&rpool->counter, raddr, pd->af); } get_addr: - PF_ACPY(naddr, &rpool->counter, af); - if (init_addr != NULL && PF_AZERO(init_addr, af)) - PF_ACPY(init_addr, naddr, af); - PF_AINC(&rpool->counter, af); + PF_ACPY(naddr, &rpool->counter, pd->af); + if (init_addr != NULL && PF_AZERO(init_addr, pd->af)) + PF_ACPY(init_addr, naddr, pd->af); + PF_AINC(&rpool->counter, pd->af); break; } } @@ -621,45 +621,50 @@ } u_short -pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, - struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr, - struct pf_ksrc_node **sn) +pf_map_addr_sn(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct pf_krule *r, struct pf_addr *saddr, struct pf_addr *naddr, + struct pfi_kkif **nkif, struct pf_addr *init_addr, + enum pf_sn_types sn_type) { - u_short reason = 0; struct pf_kpool *rpool = &r->rpool; - struct pf_srchash *sh = NULL; + u_short reason = 0; - /* Try to find a src_node if none was given and this - is a sticky-address rule. */ - if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && + /* + * Try to find an existing src_node if none was given and this is a + * sticky-address rule. + */ + if (r->rpool.opts & PF_POOL_STICKYADDR && (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(saddr, r, af, &sh, false); + pf_find_src_node(ctx, pd, saddr, r, sn_type); - /* If a src_node was found or explicitly given and it has a non-zero - route address, use this address. A zeroed address is found if the - src node was created just a moment ago in pf_create_state and it - needs to be filled in with routing decision calculated here. */ - if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { - /* If the supplied address is the same as the current one we've + /* + * If source node has been found it can be used. + */ + if (ctx->sns[sn_type] != NULL) { + /* + * If the supplied address is the same as the current one we've * been asked before, so tell the caller that there's no other - * address to be had. */ - if (PF_AEQ(naddr, &(*sn)->raddr, af)) { + * address to be had. 
+ */ + if (PF_AEQ(naddr, &(ctx->sns[sn_type]->raddr), pd->af)) { reason = PFRES_MAPFAILED; goto done; } - PF_ACPY(naddr, &(*sn)->raddr, af); + PF_ACPY(naddr, &(ctx->sns[sn_type]->raddr), pd->af); if (nkif) - *nkif = (*sn)->rkif; + *nkif = ctx->sns[sn_type]->rkif; if (V_pf_status.debug >= PF_DEBUG_NOISY) { printf("pf_map_addr: src tracking maps "); - pf_print_host(saddr, 0, af); + pf_print_host(saddr, 0, pd->af); printf(" to "); - pf_print_host(naddr, 0, af); + pf_print_host(naddr, 0, pd->af); if (nkif) printf("@%s", (*nkif)->pfik_name); printf("\n"); } + + reason = 0; goto done; } @@ -667,37 +672,48 @@ * Source node has not been found. Find a new address and store it * in variables given by the caller. */ - if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr) != 0) { + if (pf_map_addr(pd, r, saddr, naddr, nkif, init_addr) != 0) { /* pf_map_addr() sets reason counters on its own */ goto done; } - if (*sn != NULL) { - PF_ACPY(&(*sn)->raddr, naddr, af); - if (nkif) - (*sn)->rkif = *nkif; - } - - if (V_pf_status.debug >= PF_DEBUG_NOISY && - (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + if (V_pf_status.debug >= PF_DEBUG_NOISY) { printf("pf_map_addr: selected address "); - pf_print_host(naddr, 0, af); + pf_print_host(naddr, 0, pd->af); if (nkif) printf("@%s", (*nkif)->pfik_name); printf("\n"); } + /* + * Now we can allocate and insert a new src node. The new address + * is copied from the variables given by the caller, which have been + * set by pf_map_addr(). + */ + if (ctx->sh != NULL) { + if (nkif) + *nkif = rpool->cur->kif; + + if ((reason = pf_insert_src_node(ctx, pd, saddr, r, sn_type)) != 0) + goto done; + + PF_ACPY(&(ctx->sns[sn_type]->raddr), naddr, pd->af); + } + done: if (reason) { counter_u64_add(V_pf_status.counters[reason], 1); } + if (ctx->sh) + PF_HASHROW_UNLOCK(ctx->sh); + return (reason); } u_short -pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, - struct pfi_kkif *kif, struct pf_ksrc_node **sn, +pf_get_translation(struct pf_test_ctx *ctx, struct pf_pdesc *pd, + struct mbuf *m, int off, struct pfi_kkif *kif, struct pf_ksrc_node **sn, struct pf_state_key **skp, struct pf_state_key **nkp, struct pf_addr *saddr, struct pf_addr *daddr, uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack, @@ -765,8 +781,8 @@ high = r->rpool.proxy_port[1]; } if (r->rpool.mape.offset > 0) { - if (pf_get_mape_sport(pd->af, pd->proto, r, saddr, - sport, daddr, dport, naddr, nportp, sn, udp_mapping)) { + if (pf_get_mape_sport(ctx, pd, r, saddr, sport, daddr, + dport, naddr, nportp, PF_SN_NAT, udp_mapping)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: MAP-E port allocation (%u/%u/%u)" " failed\n", @@ -776,8 +792,8 @@ reason = PFRES_MAPFAILED; goto notrans; } - } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, - daddr, dport, naddr, nportp, low, high, sn, udp_mapping)) { + } else if (pf_get_sport(ctx, pd, r, saddr, sport, daddr, dport, + naddr, nportp, low, high, PF_SN_NAT, udp_mapping)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); @@ -864,7 +880,8 @@ int tries; uint16_t cut, low, high, nport; - reason = pf_map_addr_sn(pd->af, r, saddr, naddr, NULL, NULL, sn); + reason = pf_map_addr_sn(ctx, pd, r, saddr, naddr, NULL, NULL, + PF_SN_NAT); if (reason != 0) goto notrans; if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c --- a/sys/netpfil/pf/pf_nl.c +++ b/sys/netpfil/pf/pf_nl.c @@ -146,7 +146,8 @@ struct nl_writer 
*nw = npt->nw; int error = 0; int af; - struct pf_state_key *key; + struct pf_state_key *key; + enum pf_sn_types sn_type; PF_STATE_LOCK_ASSERT(s); @@ -184,10 +185,14 @@ nlattr_add_u8(nw, PF_ST_TIMEOUT, s->timeout); nlattr_add_u16(nw, PF_ST_STATE_FLAGS, s->state_flags); uint8_t sync_flags = 0; - if (s->src_node) - sync_flags |= PFSYNC_FLAG_SRCNODE; - if (s->nat_src_node) - sync_flags |= PFSYNC_FLAG_NATSRCNODE; + for (sn_type=0; sn_typesns[sn_type] != NULL) + continue; + if (sn_type == PF_SN_LIMIT || sn_type == PF_SN_ROUTE) + sync_flags |= PFSYNC_FLAG_SRCNODE; + else + sync_flags |= PFSYNC_FLAG_NATSRCNODE; + } nlattr_add_u8(nw, PF_ST_SYNC_FLAGS, sync_flags); nlattr_add_u64(nw, PF_ST_ID, s->id); nlattr_add_u32(nw, PF_ST_CREATORID, htonl(s->creatorid)); diff --git a/sys/netpfil/pf/pf_nv.c b/sys/netpfil/pf/pf_nv.c --- a/sys/netpfil/pf/pf_nv.c +++ b/sys/netpfil/pf/pf_nv.c @@ -928,8 +928,9 @@ nvlist_t * pf_state_to_nvstate(const struct pf_kstate *s) { - nvlist_t *nvl, *tmp; - uint32_t expire, flags = 0; + nvlist_t *nvl, *tmp; + uint32_t expire, flags = 0; + enum pf_sn_types sn_type; nvl = nvlist_create(0); if (nvl == NULL) @@ -993,10 +994,14 @@ nvlist_add_number(nvl, "creatorid", s->creatorid); nvlist_add_number(nvl, "direction", s->direction); nvlist_add_number(nvl, "state_flags", s->state_flags); - if (s->src_node) - flags |= PFSYNC_FLAG_SRCNODE; - if (s->nat_src_node) - flags |= PFSYNC_FLAG_NATSRCNODE; + for (sn_type=0; sn_typesns[sn_type] == NULL) + continue; + if (sn_type == PF_SN_LIMIT || sn_type == PF_SN_ROUTE) + flags |= PFSYNC_FLAG_SRCNODE; + else + flags |= PFSYNC_FLAG_NATSRCNODE; + } nvlist_add_number(nvl, "sync_flags", flags); return (nvl); diff --git a/tests/sys/netpfil/pf/Makefile b/tests/sys/netpfil/pf/Makefile --- a/tests/sys/netpfil/pf/Makefile +++ b/tests/sys/netpfil/pf/Makefile @@ -6,6 +6,7 @@ ATF_TESTS_SH+= altq \ anchor \ + counters \ debug \ divert-to \ dup \ diff --git a/tests/sys/netpfil/pf/counters.sh b/tests/sys/netpfil/pf/counters.sh new file mode 100755 --- /dev/null +++ b/tests/sys/netpfil/pf/counters.sh @@ -0,0 +1,83 @@ +# +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2024 Kajetan Staszkiewicz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. + +. 
$(atf_get_srcdir)/utils.subr + +atf_test_case "source_track" "cleanup" + +counters_head() +{ + atf_set descr 'Rule and table counters' + atf_set require.user root +} + +counters_body() +{ + setup_router_server_ipv6 + + # Clients will connect from another network behind the router. + # This allows for using multiple source addresses and for tester jail + # to not respond with RST packets for SYN+ACKs. + jexec router route add -6 2001:db8:44::0/64 2001:db8:42::2 + jexec server route add -6 2001:db8:44::0/64 2001:db8:43::1 + + pft_set_rules router \ + "table { 2001:db8:44::0/64 }" \ + "nat on ${epair_server}a inet6 from to any -> ${epair_server}a" \ + "block" \ + "pass inet6 proto icmp6 icmp6-type { neighbrsol, neighbradv }" \ + "pass in on ${epair_tester}b inet6 proto tcp from keep state" \ + "pass out on ${epair_server}a inet6 proto tcp keep state" + + # Use the 3-way testing so that more than 1 packet is sent. + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4201 --fromaddr 2001:db8:44::1 + + states=$(mktemp) || exit 1 + jexec router pfctl -qvss > ${states} + echo === states === + cat $states + + nodes=$(mktemp) || exit 1 + jexec router pfctl -qvsS > ${nodes} + echo === nodes === + cat $nodes + + tables=$(mktemp) || exit 1 + jexec router pfctl -qvsT > ${tables} + echo === tables === + cat $tables +} + +counters_cleanup() +{ + pft_cleanup +} + + +atf_init_test_cases() +{ + atf_add_test_case "counters" +} diff --git a/tests/sys/netpfil/pf/src_track.sh b/tests/sys/netpfil/pf/src_track.sh --- a/tests/sys/netpfil/pf/src_track.sh +++ b/tests/sys/netpfil/pf/src_track.sh @@ -150,28 +150,34 @@ # 2 connections from host ::1 matching rule_A will be allowed, 1 will fail to create a state. ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4211 --fromaddr 2001:db8:44::1 ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4212 --fromaddr 2001:db8:44::1 - ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4213 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4213 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4214 --fromaddr 2001:db8:44::1 # 2 connections from host ::1 matching rule_B will be allowed, 1 will fail to create a state. # Limits from rule_A don't interfere with rule_B. ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4221 --fromaddr 2001:db8:44::1 ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4222 --fromaddr 2001:db8:44::1 - ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4223 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4223 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4224 --fromaddr 2001:db8:44::1 # 2 connections from host ::2 matching rule_B will be allowed, 1 will fail to create a state. # Limits for host ::1 will not interfere with host ::2. 
- ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4224 --fromaddr 2001:db8:44::2 ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4225 --fromaddr 2001:db8:44::2 - ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4226 --fromaddr 2001:db8:44::2 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4226 --fromaddr 2001:db8:44::2 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4227 --fromaddr 2001:db8:44::2 + ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4228 --fromaddr 2001:db8:44::2 # We will check the resulting source nodes, though. # Order of source nodes in output is not guaranteed, find each one separately. nodes=$(mktemp) || exit 1 jexec router pfctl -qvsS > $nodes + echo === Nodes start === + cat $nodes + echo === Nodes end === for node_regexp in \ - '2001:db8:44::1 -> :: \( states 2, connections 2, rate [0-9/\.]+s \)\s+age [0-9:]+, 6 pkts, [0-9]+ bytes, filter rule 3' \ - '2001:db8:44::1 -> :: \( states 2, connections 2, rate [0-9/\.]+s \)\s+age [0-9:]+, 6 pkts, [0-9]+ bytes, filter rule 4' \ - '2001:db8:44::2 -> :: \( states 2, connections 2, rate [0-9/\.]+s \)\s+age [0-9:]+, 6 pkts, [0-9]+ bytes, filter rule 4' \ + '2001:db8:44::1 -> :: \( states 3, connections 3, rate [0-9/\.]+s \)\s+age [0-9:]+, 9 pkts, [0-9]+ bytes, filter rule 3' \ + '2001:db8:44::1 -> :: \( states 3, connections 3, rate [0-9/\.]+s \)\s+age [0-9:]+, 9 pkts, [0-9]+ bytes, filter rule 4' \ + '2001:db8:44::2 -> :: \( states 3, connections 3, rate [0-9/\.]+s \)\s+age [0-9:]+, 9 pkts, [0-9]+ bytes, filter rule 4' \ ; do cat $nodes | tr '\n' ' ' | grep -qE "$node_regexp" || atf_fail "Source nodes not matching expected output" done @@ -185,9 +191,58 @@ pft_cleanup } +max_src_states_global_head() +{ + atf_set descr 'Max states per source global' + atf_set require.user root +} + +max_src_states_global_body() +{ + setup_router_server_ipv6 + + # Clients will connect from another network behind the router. + # This allows for using multiple source addresses and for tester jail + # to not respond with RST packets for SYN+ACKs. + jexec router route add -6 2001:db8:44::0/64 2001:db8:42::2 + jexec server route add -6 2001:db8:44::0/64 2001:db8:43::1 + + pft_set_rules router \ + "block" \ + "pass inet6 proto icmp6 icmp6-type { neighbrsol, neighbradv }" \ + "pass in on ${epair_tester}b inet6 proto tcp from port 4210:4219 keep state (max-src-states 3 source-track global) label rule_A" \ + "pass in on ${epair_tester}b inet6 proto tcp from port 4220:4229 keep state (max-src-states 3 source-track global) label rule_B" \ + "pass out on ${epair_server}a keep state" + + # Global source tracking creates a single source node shared between all + # rules for each connecting source IP address and counts states created + # by all rules. Each rule has its own max-src-conn value checked against + # that single source node. + + # 3 connections from host ::1 matching rule_A will be allowed. + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4211 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4212 --fromaddr 2001:db8:44::1 + ping_server_check_reply exit:0 --ping-type=tcp3way --send-sport=4213 --fromaddr 2001:db8:44::1 + # The 4th connection matching rule_A from host ::1 will have its state killed. + ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4214 --fromaddr 2001:db8:44::1 + # A connection matching rule_B from host ::1 will have its state killed too. 
+ ping_server_check_reply exit:1 --ping-type=tcp3way --send-sport=4221 --fromaddr 2001:db8:44::1 + + nodes=$(mktemp) || exit 1 + jexec router pfctl -qvsS > $nodes + node_regexp='2001:db8:44::1 -> :: \( states 3, connections 3, rate [0-9/\.]+s \)\s+age [0-9:]+, 9 pkts, [0-9]+ bytes' + cat $nodes | tr '\n' ' ' | grep -qE "$node_regexp" || atf_fail "Source nodes not matching expected output" +} + +max_src_states_global_cleanup() +{ + pft_cleanup +} + atf_init_test_cases() { atf_add_test_case "source_track" atf_add_test_case "max_src_conn_rule" atf_add_test_case "max_src_states_rule" + atf_add_test_case "max_src_states_global" }
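
Reviewer note, not part of the patch: the standalone C sketch below restates the central data-structure change for readers following the diff. A state now carries one source-node slot per tracking type (PF_SN_LIMIT, PF_SN_NAT, PF_SN_ROUTE) instead of the former src_node/nat_src_node pair, and the export paths derive the pfsync flags by iterating those slots. The *_stub types and the flag values here are simplified stand-ins for illustration, not the kernel definitions; only the enum and the per-type iteration mirror the patch.

/*
 * Minimal userland sketch of the per-type source-node array pattern.
 * Stand-in types and illustrative flag values; not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

enum pf_sn_types { PF_SN_LIMIT, PF_SN_NAT, PF_SN_ROUTE, PF_SN_MAX };

#define PFSYNC_FLAG_SRCNODE	0x04	/* illustrative values */
#define PFSYNC_FLAG_NATSRCNODE	0x08

struct src_node_stub {
	int	states;			/* states attached to this node */
};

struct state_stub {
	/* one slot per tracking type, replacing src_node/nat_src_node */
	struct src_node_stub	*sns[PF_SN_MAX];
};

/* Mirrors the export loops: limit/route nodes set SRCNODE, NAT sets NATSRCNODE. */
static unsigned
sync_flags(const struct state_stub *s)
{
	unsigned flags = 0;

	for (enum pf_sn_types t = 0; t < PF_SN_MAX; t++) {
		if (s->sns[t] == NULL)
			continue;
		if (t == PF_SN_LIMIT || t == PF_SN_ROUTE)
			flags |= PFSYNC_FLAG_SRCNODE;
		else
			flags |= PFSYNC_FLAG_NATSRCNODE;
	}
	return (flags);
}

int
main(void)
{
	struct src_node_stub limit = { .states = 1 };
	struct src_node_stub nat = { .states = 1 };
	struct state_stub s = {
		.sns = { [PF_SN_LIMIT] = &limit, [PF_SN_NAT] = &nat }
	};

	printf("sync flags: 0x%x\n", sync_flags(&s));
	return (0);
}

Built as a normal userland program this prints both flags set; in the patch the same iteration appears in pfsync_state_export(), pf_state_export(), pf_state_to_nvstate() and the netlink state dump.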