Index: head/sys/contrib/pf/net/pf.c
===================================================================
--- head/sys/contrib/pf/net/pf.c	(revision 178284)
+++ head/sys/contrib/pf/net/pf.c	(revision 178285)
@@ -1,7568 +1,7568 @@
/*	$OpenBSD: pf.c,v 1.527 2007/02/22 15:23:23 pyr Exp $ */

/*
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002,2003 Henning Brauer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
* */ #ifdef __FreeBSD__ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #endif #ifdef __FreeBSD__ #include "opt_mac.h" #include "opt_bpf.h" #include "opt_pf.h" #ifdef DEV_BPF #define NBPFILTER DEV_BPF #else #define NBPFILTER 0 #endif #ifdef DEV_PFLOG #define NPFLOG DEV_PFLOG #else #define NPFLOG 0 #endif #ifdef DEV_PFSYNC #define NPFSYNC DEV_PFSYNC #else #define NPFSYNC 0 #endif #else #include "bpfilter.h" #include "pflog.h" #include "pfsync.h" #endif #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #include #include #else #include #endif #include #ifdef __FreeBSD__ #include #include #include #else #include #endif #include #include #include #include #ifndef __FreeBSD__ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __FreeBSD__ #include #endif #include #include #if NPFSYNC > 0 #include #endif /* NPFSYNC > 0 */ #ifdef INET6 #include #include #include #include #ifdef __FreeBSD__ #include #include #endif #endif /* INET6 */ #ifdef __FreeBSD__ #include #include #include #include extern int ip_optcopy(struct ip *, struct ip *); extern int debug_pfugidhack; #endif #define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x /* * Global variables */ struct pf_altqqueue pf_altqs[2]; struct pf_palist pf_pabuf; struct pf_altqqueue *pf_altqs_active; struct pf_altqqueue *pf_altqs_inactive; struct pf_status pf_status; u_int32_t ticket_altqs_active; u_int32_t ticket_altqs_inactive; int altqs_inactive_open; u_int32_t ticket_pabuf; struct pf_anchor_stackframe { struct pf_ruleset *rs; struct pf_rule *r; struct pf_anchor_node *parent; struct pf_anchor *child; } pf_anchor_stack[64]; #ifdef __FreeBSD__ uma_zone_t pf_src_tree_pl, pf_rule_pl; uma_zone_t pf_state_pl, pf_altq_pl, pf_pooladdr_pl; #else struct pool pf_src_tree_pl, pf_rule_pl; struct pool pf_state_pl, pf_altq_pl, pf_pooladdr_pl; #endif void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t); void pf_init_threshold(struct pf_threshold *, u_int32_t, u_int32_t); void pf_add_threshold(struct pf_threshold *); int pf_check_threshold(struct pf_threshold *); void pf_change_ap(struct pf_addr *, u_int16_t *, u_int16_t *, u_int16_t *, struct pf_addr *, u_int16_t, u_int8_t, sa_family_t); int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *); #ifdef INET6 void pf_change_a6(struct pf_addr *, u_int16_t *, struct pf_addr *, u_int8_t); #endif /* INET6 */ void pf_change_icmp(struct pf_addr *, u_int16_t *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t *, u_int16_t *, u_int16_t *, u_int16_t *, u_int8_t, sa_family_t); #ifdef __FreeBSD__ void pf_send_tcp(struct mbuf *, const struct pf_rule *, sa_family_t, #else void pf_send_tcp(const struct pf_rule *, sa_family_t, #endif const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, u_int16_t, struct ether_header *, struct ifnet *); void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_rule *); struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, int, int, struct pfi_kif *, struct pf_addr *, u_int16_t, struct pf_addr *, u_int16_t, int); struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *, int, int, struct pfi_kif *, struct pf_src_node **, struct pf_addr *, u_int16_t, struct pf_addr *, u_int16_t, struct pf_addr *, u_int16_t *); int pf_test_tcp(struct 
pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, struct pf_rule **, #ifdef __FreeBSD__ struct pf_ruleset **, struct ifqueue *, struct inpcb *); #else struct pf_ruleset **, struct ifqueue *); #endif int pf_test_udp(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, struct pf_rule **, #ifdef __FreeBSD__ struct pf_ruleset **, struct ifqueue *, struct inpcb *); #else struct pf_ruleset **, struct ifqueue *); #endif int pf_test_icmp(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct ifqueue *); int pf_test_other(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct ifqueue *); int pf_test_fragment(struct pf_rule **, int, struct pfi_kif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **); int pf_test_state_tcp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); int pf_test_state_udp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *); int pf_test_state_icmp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); int pf_test_state_other(struct pf_state **, int, struct pfi_kif *, struct pf_pdesc *); int pf_match_tag(struct mbuf *, struct pf_rule *, struct pf_mtag *, int *); int pf_step_out_of_anchor(int *, struct pf_ruleset **, int, struct pf_rule **, struct pf_rule **, int *); void pf_hash(struct pf_addr *, struct pf_addr *, struct pf_poolhashkey *, sa_family_t); int pf_map_addr(u_int8_t, struct pf_rule *, struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_src_node **); int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *, struct pf_addr *, struct pf_addr *, u_int16_t, struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t, struct pf_src_node **); void pf_route(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *); void pf_route6(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *); #ifdef __FreeBSD__ /* XXX: import */ #else int pf_socket_lookup(int, struct pf_pdesc *); #endif u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, sa_family_t); u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, sa_family_t); u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, u_int16_t); void pf_set_rt_ifp(struct pf_state *, struct pf_addr *); int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); int pf_addr_wrap_neq(struct pf_addr_wrap *, struct pf_addr_wrap *); struct pf_state *pf_find_state_recurse(struct pfi_kif *, struct pf_state_cmp *, u_int8_t); int pf_src_connlimit(struct pf_state **); int pf_check_congestion(struct ifqueue *); #ifdef __FreeBSD__ int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); extern int pf_end_threads; struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX]; #else extern struct pool pfr_ktable_pl; extern struct pool pfr_kentry_pl; struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = { { &pf_state_pl, PFSTATE_HIWAT }, { &pf_src_tree_pl, PFSNODE_HIWAT }, { &pf_frent_pl, PFFRAG_FRENT_HIWAT }, { &pfr_ktable_pl, PFR_KTABLE_HIWAT }, { &pfr_kentry_pl, PFR_KENTRY_HIWAT } }; #endif #define STATE_LOOKUP() \ do { \ if (direction == PF_IN) \ *state = pf_find_state_recurse( \ kif, &key, PF_EXT_GWY); \ 
else \ *state = pf_find_state_recurse( \ kif, &key, PF_LAN_EXT); \ if (*state == NULL || (*state)->timeout == PFTM_PURGE) \ return (PF_DROP); \ if (direction == PF_OUT && \ (((*state)->rule.ptr->rt == PF_ROUTETO && \ (*state)->rule.ptr->direction == PF_OUT) || \ ((*state)->rule.ptr->rt == PF_REPLYTO && \ (*state)->rule.ptr->direction == PF_IN)) && \ (*state)->rt_kif != NULL && \ (*state)->rt_kif != kif) \ return (PF_PASS); \ } while (0) #define STATE_TRANSLATE(s) \ (s)->lan.addr.addr32[0] != (s)->gwy.addr.addr32[0] || \ ((s)->af == AF_INET6 && \ ((s)->lan.addr.addr32[1] != (s)->gwy.addr.addr32[1] || \ (s)->lan.addr.addr32[2] != (s)->gwy.addr.addr32[2] || \ (s)->lan.addr.addr32[3] != (s)->gwy.addr.addr32[3])) || \ (s)->lan.port != (s)->gwy.port #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all #define STATE_INC_COUNTERS(s) \ do { \ s->rule.ptr->states++; \ if (s->anchor.ptr != NULL) \ s->anchor.ptr->states++; \ if (s->nat_rule.ptr != NULL) \ s->nat_rule.ptr->states++; \ } while (0) #define STATE_DEC_COUNTERS(s) \ do { \ if (s->nat_rule.ptr != NULL) \ s->nat_rule.ptr->states--; \ if (s->anchor.ptr != NULL) \ s->anchor.ptr->states--; \ s->rule.ptr->states--; \ } while (0) struct pf_src_tree tree_src_tracking; struct pf_state_tree_id tree_id; struct pf_state_queue state_list; #ifdef __FreeBSD__ static int pf_src_compare(struct pf_src_node *, struct pf_src_node *); static int pf_state_compare_lan_ext(struct pf_state *, struct pf_state *); static int pf_state_compare_ext_gwy(struct pf_state *, struct pf_state *); static int pf_state_compare_id(struct pf_state *, struct pf_state *); #endif RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare); RB_GENERATE(pf_state_tree_lan_ext, pf_state, u.s.entry_lan_ext, pf_state_compare_lan_ext); RB_GENERATE(pf_state_tree_ext_gwy, pf_state, u.s.entry_ext_gwy, pf_state_compare_ext_gwy); RB_GENERATE(pf_state_tree_id, pf_state, u.s.entry_id, pf_state_compare_id); #ifdef __FreeBSD__ static int #else static __inline int #endif pf_src_compare(struct pf_src_node *a, struct pf_src_node *b) { int diff; if (a->rule.ptr > b->rule.ptr) return (1); if (a->rule.ptr < b->rule.ptr) return (-1); if ((diff = a->af - b->af) != 0) return (diff); switch (a->af) { #ifdef INET case AF_INET: if (a->addr.addr32[0] > b->addr.addr32[0]) return (1); if (a->addr.addr32[0] < b->addr.addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->addr.addr32[3] > b->addr.addr32[3]) return (1); if (a->addr.addr32[3] < b->addr.addr32[3]) return (-1); if (a->addr.addr32[2] > b->addr.addr32[2]) return (1); if (a->addr.addr32[2] < b->addr.addr32[2]) return (-1); if (a->addr.addr32[1] > b->addr.addr32[1]) return (1); if (a->addr.addr32[1] < b->addr.addr32[1]) return (-1); if (a->addr.addr32[0] > b->addr.addr32[0]) return (1); if (a->addr.addr32[0] < b->addr.addr32[0]) return (-1); break; #endif /* INET6 */ } return (0); } #ifdef __FreeBSD__ static int #else static __inline int #endif pf_state_compare_lan_ext(struct pf_state *a, struct pf_state *b) { int diff; if ((diff = a->proto - b->proto) != 0) return (diff); if ((diff = a->af - b->af) != 0) return (diff); switch (a->af) { #ifdef INET case AF_INET: if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0]) return (1); if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) return (-1); if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) return (1); if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->lan.addr.addr32[3] > 
b->lan.addr.addr32[3]) return (1); if (a->lan.addr.addr32[3] < b->lan.addr.addr32[3]) return (-1); if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) return (1); if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) return (-1); if (a->lan.addr.addr32[2] > b->lan.addr.addr32[2]) return (1); if (a->lan.addr.addr32[2] < b->lan.addr.addr32[2]) return (-1); if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) return (1); if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) return (-1); if (a->lan.addr.addr32[1] > b->lan.addr.addr32[1]) return (1); if (a->lan.addr.addr32[1] < b->lan.addr.addr32[1]) return (-1); if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) return (1); if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) return (-1); if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0]) return (1); if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) return (-1); if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) return (1); if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); break; #endif /* INET6 */ } if ((diff = a->lan.port - b->lan.port) != 0) return (diff); if ((diff = a->ext.port - b->ext.port) != 0) return (diff); return (0); } #ifdef __FreeBSD__ static int #else static __inline int #endif pf_state_compare_ext_gwy(struct pf_state *a, struct pf_state *b) { int diff; if ((diff = a->proto - b->proto) != 0) return (diff); if ((diff = a->af - b->af) != 0) return (diff); switch (a->af) { #ifdef INET case AF_INET: if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) return (1); if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) return (1); if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) return (1); if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) return (-1); if (a->gwy.addr.addr32[3] > b->gwy.addr.addr32[3]) return (1); if (a->gwy.addr.addr32[3] < b->gwy.addr.addr32[3]) return (-1); if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) return (1); if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) return (-1); if (a->gwy.addr.addr32[2] > b->gwy.addr.addr32[2]) return (1); if (a->gwy.addr.addr32[2] < b->gwy.addr.addr32[2]) return (-1); if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) return (1); if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) return (-1); if (a->gwy.addr.addr32[1] > b->gwy.addr.addr32[1]) return (1); if (a->gwy.addr.addr32[1] < b->gwy.addr.addr32[1]) return (-1); if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) return (1); if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) return (1); if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) return (-1); break; #endif /* INET6 */ } if ((diff = a->ext.port - b->ext.port) != 0) return (diff); if ((diff = a->gwy.port - b->gwy.port) != 0) return (diff); return (0); } #ifdef __FreeBSD__ static int #else static __inline int #endif pf_state_compare_id(struct pf_state *a, struct pf_state *b) { if (a->id > b->id) return (1); if (a->id < b->id) return (-1); if (a->creatorid > b->creatorid) return (1); if (a->creatorid < b->creatorid) return (-1); return (0); } #ifdef INET6 void pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: dst->addr32[0] = src->addr32[0]; break; #endif /* INET */ case AF_INET6: dst->addr32[0] = src->addr32[0]; dst->addr32[1] = src->addr32[1]; dst->addr32[2] = src->addr32[2]; dst->addr32[3] = src->addr32[3]; break; 
} } #endif /* INET6 */ struct pf_state * pf_find_state_byid(struct pf_state_cmp *key) { pf_status.fcounters[FCNT_STATE_SEARCH]++; return (RB_FIND(pf_state_tree_id, &tree_id, (struct pf_state *)key)); } struct pf_state * pf_find_state_recurse(struct pfi_kif *kif, struct pf_state_cmp *key, u_int8_t tree) { struct pf_state *s; pf_status.fcounters[FCNT_STATE_SEARCH]++; switch (tree) { case PF_LAN_EXT: if ((s = RB_FIND(pf_state_tree_lan_ext, &kif->pfik_lan_ext, (struct pf_state *)key)) != NULL) return (s); if ((s = RB_FIND(pf_state_tree_lan_ext, &pfi_all->pfik_lan_ext, (struct pf_state *)key)) != NULL) return (s); return (NULL); case PF_EXT_GWY: if ((s = RB_FIND(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, (struct pf_state *)key)) != NULL) return (s); if ((s = RB_FIND(pf_state_tree_ext_gwy, &pfi_all->pfik_ext_gwy, (struct pf_state *)key)) != NULL) return (s); return (NULL); default: panic("pf_find_state_recurse"); } } struct pf_state * pf_find_state_all(struct pf_state_cmp *key, u_int8_t tree, int *more) { struct pf_state *s, *ss = NULL; struct pfi_kif *kif; pf_status.fcounters[FCNT_STATE_SEARCH]++; switch (tree) { case PF_LAN_EXT: TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) { s = RB_FIND(pf_state_tree_lan_ext, &kif->pfik_lan_ext, (struct pf_state *)key); if (s == NULL) continue; if (more == NULL) return (s); ss = s; (*more)++; } return (ss); case PF_EXT_GWY: TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) { s = RB_FIND(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, (struct pf_state *)key); if (s == NULL) continue; if (more == NULL) return (s); ss = s; (*more)++; } return (ss); default: panic("pf_find_state_all"); } } void pf_init_threshold(struct pf_threshold *threshold, u_int32_t limit, u_int32_t seconds) { threshold->limit = limit * PF_THRESHOLD_MULT; threshold->seconds = seconds; threshold->count = 0; threshold->last = time_second; } void pf_add_threshold(struct pf_threshold *threshold) { u_int32_t t = time_second, diff = t - threshold->last; if (diff >= threshold->seconds) threshold->count = 0; else threshold->count -= threshold->count * diff / threshold->seconds; threshold->count += PF_THRESHOLD_MULT; threshold->last = t; } int pf_check_threshold(struct pf_threshold *threshold) { return (threshold->count > threshold->limit); } int pf_src_connlimit(struct pf_state **state) { struct pf_state *s; int bad = 0; (*state)->src_node->conn++; (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); if ((*state)->rule.ptr->max_src_conn && (*state)->rule.ptr->max_src_conn < (*state)->src_node->conn) { pf_status.lcounters[LCNT_SRCCONN]++; bad++; } if ((*state)->rule.ptr->max_src_conn_rate.limit && pf_check_threshold(&(*state)->src_node->conn_rate)) { pf_status.lcounters[LCNT_SRCCONNRATE]++; bad++; } if (!bad) return (0); if ((*state)->rule.ptr->overload_tbl) { struct pfr_addr p; u_int32_t killed = 0; pf_status.lcounters[LCNT_OVERLOAD_TABLE]++; if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf_src_connlimit: blocking address "); pf_print_host(&(*state)->src_node->addr, 0, (*state)->af); } bzero(&p, sizeof(p)); p.pfra_af = (*state)->af; switch ((*state)->af) { #ifdef INET case AF_INET: p.pfra_net = 32; p.pfra_ip4addr = (*state)->src_node->addr.v4; break; #endif /* INET */ #ifdef INET6 case AF_INET6: p.pfra_net = 128; p.pfra_ip6addr = (*state)->src_node->addr.v6; break; #endif /* INET6 */ } pfr_insert_kentry((*state)->rule.ptr->overload_tbl, &p, time_second); /* kill existing states if that's required. 
*/ if ((*state)->rule.ptr->flush) { pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++; RB_FOREACH(s, pf_state_tree_id, &tree_id) { /* * Kill states from this source. (Only those * from the same rule if PF_FLUSH_GLOBAL is not * set) */ if (s->af == (*state)->af && (((*state)->direction == PF_OUT && PF_AEQ(&(*state)->src_node->addr, &s->lan.addr, s->af)) || ((*state)->direction == PF_IN && PF_AEQ(&(*state)->src_node->addr, &s->ext.addr, s->af))) && ((*state)->rule.ptr->flush & PF_FLUSH_GLOBAL || (*state)->rule.ptr == s->rule.ptr)) { s->timeout = PFTM_PURGE; s->src.state = s->dst.state = TCPS_CLOSED; killed++; } } if (pf_status.debug >= PF_DEBUG_MISC) printf(", %u states killed", killed); } if (pf_status.debug >= PF_DEBUG_MISC) printf("\n"); } /* kill this state */ (*state)->timeout = PFTM_PURGE; (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; return (1); } int pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, struct pf_addr *src, sa_family_t af) { struct pf_src_node k; if (*sn == NULL) { k.af = af; PF_ACPY(&k.addr, src, af); if (rule->rule_flag & PFRULE_RULESRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR) k.rule.ptr = rule; else k.rule.ptr = NULL; pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; *sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k); } if (*sn == NULL) { if (!rule->max_src_nodes || rule->src_nodes < rule->max_src_nodes) (*sn) = pool_get(&pf_src_tree_pl, PR_NOWAIT); else pf_status.lcounters[LCNT_SRCNODES]++; if ((*sn) == NULL) return (-1); bzero(*sn, sizeof(struct pf_src_node)); pf_init_threshold(&(*sn)->conn_rate, rule->max_src_conn_rate.limit, rule->max_src_conn_rate.seconds); (*sn)->af = af; if (rule->rule_flag & PFRULE_RULESRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR) (*sn)->rule.ptr = rule; else (*sn)->rule.ptr = NULL; PF_ACPY(&(*sn)->addr, src, af); if (RB_INSERT(pf_src_tree, &tree_src_tracking, *sn) != NULL) { if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: src_tree insert failed: "); pf_print_host(&(*sn)->addr, 0, af); printf("\n"); } pool_put(&pf_src_tree_pl, *sn); return (-1); } (*sn)->creation = time_second; (*sn)->ruletype = rule->action; if ((*sn)->rule.ptr != NULL) (*sn)->rule.ptr->src_nodes++; pf_status.scounters[SCNT_SRC_NODE_INSERT]++; pf_status.src_nodes++; } else { if (rule->max_src_states && (*sn)->states >= rule->max_src_states) { pf_status.lcounters[LCNT_SRCSTATES]++; return (-1); } } return (0); } int pf_insert_state(struct pfi_kif *kif, struct pf_state *state) { /* Thou MUST NOT insert multiple duplicate keys */ state->u.s.kif = kif; if (RB_INSERT(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state)) { if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state insert failed: tree_lan_ext"); printf(" lan: "); pf_print_host(&state->lan.addr, state->lan.port, state->af); printf(" gwy: "); pf_print_host(&state->gwy.addr, state->gwy.port, state->af); printf(" ext: "); pf_print_host(&state->ext.addr, state->ext.port, state->af); if (state->sync_flags & PFSTATE_FROMSYNC) printf(" (from sync)"); printf("\n"); } return (-1); } if (RB_INSERT(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state)) { if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state insert failed: tree_ext_gwy"); printf(" lan: "); pf_print_host(&state->lan.addr, state->lan.port, state->af); printf(" gwy: "); pf_print_host(&state->gwy.addr, state->gwy.port, state->af); printf(" ext: "); pf_print_host(&state->ext.addr, state->ext.port, state->af); if (state->sync_flags & PFSTATE_FROMSYNC) printf(" (from sync)"); printf("\n"); } RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, 
state); return (-1); } if (state->id == 0 && state->creatorid == 0) { state->id = htobe64(pf_status.stateid++); state->creatorid = pf_status.hostid; } if (RB_INSERT(pf_state_tree_id, &tree_id, state) != NULL) { if (pf_status.debug >= PF_DEBUG_MISC) { #ifdef __FreeBSD__ printf("pf: state insert failed: " "id: %016llx creatorid: %08x", (long long)be64toh(state->id), ntohl(state->creatorid)); #else printf("pf: state insert failed: " "id: %016llx creatorid: %08x", betoh64(state->id), ntohl(state->creatorid)); #endif if (state->sync_flags & PFSTATE_FROMSYNC) printf(" (from sync)"); printf("\n"); } RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state); RB_REMOVE(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state); return (-1); } TAILQ_INSERT_TAIL(&state_list, state, u.s.entry_list); pf_status.fcounters[FCNT_STATE_INSERT]++; pf_status.states++; pfi_kif_ref(kif, PFI_KIF_REF_STATE); #if NPFSYNC pfsync_insert_state(state); #endif return (0); } void pf_purge_thread(void *v) { int nloops = 0, s; for (;;) { tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz); #ifdef __FreeBSD__ sx_slock(&pf_consistency_lock); PF_LOCK(); if (pf_end_threads) { pf_purge_expired_states(pf_status.states); pf_purge_expired_fragments(); pf_purge_expired_src_nodes(0); pf_end_threads++; sx_sunlock(&pf_consistency_lock); PF_UNLOCK(); wakeup(pf_purge_thread); kproc_exit(0); } #endif s = splsoftnet(); /* process a fraction of the state table every second */ pf_purge_expired_states(1 + (pf_status.states / pf_default_rule.timeout[PFTM_INTERVAL])); /* purge other expired types every PFTM_INTERVAL seconds */ if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) { pf_purge_expired_fragments(); pf_purge_expired_src_nodes(0); nloops = 0; } splx(s); #ifdef __FreeBSD__ PF_UNLOCK(); sx_sunlock(&pf_consistency_lock); #endif } } u_int32_t pf_state_expires(const struct pf_state *state) { u_int32_t timeout; u_int32_t start; u_int32_t end; u_int32_t states; /* handle all PFTM_* > PFTM_MAX here */ if (state->timeout == PFTM_PURGE) return (time_second); if (state->timeout == PFTM_UNTIL_PACKET) return (0); #ifdef __FreeBSD__ KASSERT(state->timeout != PFTM_UNLINKED, ("pf_state_expires: timeout == PFTM_UNLINKED")); KASSERT((state->timeout < PFTM_MAX), ("pf_state_expires: timeout > PFTM_MAX")); #else KASSERT(state->timeout != PFTM_UNLINKED); KASSERT(state->timeout < PFTM_MAX); #endif timeout = state->rule.ptr->timeout[state->timeout]; if (!timeout) timeout = pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; if (start) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = state->rule.ptr->states; } else { start = pf_default_rule.timeout[PFTM_ADAPTIVE_START]; end = pf_default_rule.timeout[PFTM_ADAPTIVE_END]; states = pf_status.states; } if (end && states > start && start < end) { if (states < end) return (state->expire + timeout * (end - states) / (end - start)); else return (time_second); } return (state->expire + timeout); } void pf_purge_expired_src_nodes(int waslocked) { struct pf_src_node *cur, *next; int locked = waslocked; for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) { next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur); if (cur->states <= 0 && cur->expire <= time_second) { if (! 
locked) { #ifdef __FreeBSD__ if (!sx_try_upgrade(&pf_consistency_lock)) { PF_UNLOCK(); sx_sunlock(&pf_consistency_lock); sx_xlock(&pf_consistency_lock); PF_LOCK(); } #else rw_enter_write(&pf_consistency_lock); #endif next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur); locked = 1; } if (cur->rule.ptr != NULL) { cur->rule.ptr->src_nodes--; if (cur->rule.ptr->states <= 0 && cur->rule.ptr->max_src_nodes <= 0) pf_rm_rule(NULL, cur->rule.ptr); } RB_REMOVE(pf_src_tree, &tree_src_tracking, cur); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, cur); } } if (locked && !waslocked) #ifdef __FreeBSD__ sx_downgrade(&pf_consistency_lock); #else rw_exit_write(&pf_consistency_lock); #endif } void pf_src_tree_remove_state(struct pf_state *s) { u_int32_t timeout; if (s->src_node != NULL) { if (s->proto == IPPROTO_TCP) { if (s->src.tcp_est) --s->src_node->conn; } if (--s->src_node->states <= 0) { timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!timeout) timeout = pf_default_rule.timeout[PFTM_SRC_NODE]; s->src_node->expire = time_second + timeout; } } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { if (--s->nat_src_node->states <= 0) { timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!timeout) timeout = pf_default_rule.timeout[PFTM_SRC_NODE]; s->nat_src_node->expire = time_second + timeout; } } s->src_node = s->nat_src_node = NULL; } /* callers should be at splsoftnet */ void pf_unlink_state(struct pf_state *cur) { #ifdef __FreeBSD__ if (cur->local_flags & PFSTATE_EXPIRING) return; cur->local_flags |= PFSTATE_EXPIRING; #endif if (cur->src.state == PF_TCPS_PROXY_DST) { #ifdef __FreeBSD__ pf_send_tcp(NULL, cur->rule.ptr, cur->af, #else pf_send_tcp(cur->rule.ptr, cur->af, #endif &cur->ext.addr, &cur->lan.addr, cur->ext.port, cur->lan.port, cur->src.seqhi, cur->src.seqlo + 1, TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL); } RB_REMOVE(pf_state_tree_ext_gwy, &cur->u.s.kif->pfik_ext_gwy, cur); RB_REMOVE(pf_state_tree_lan_ext, &cur->u.s.kif->pfik_lan_ext, cur); RB_REMOVE(pf_state_tree_id, &tree_id, cur); #if NPFSYNC if (cur->creatorid == pf_status.hostid) pfsync_delete_state(cur); #endif cur->timeout = PFTM_UNLINKED; pf_src_tree_remove_state(cur); } /* callers should be at splsoftnet and hold the * write_lock on pf_consistency_lock */ void pf_free_state(struct pf_state *cur) { #if NPFSYNC if (pfsyncif != NULL && (pfsyncif->sc_bulk_send_next == cur || pfsyncif->sc_bulk_terminator == cur)) return; #endif #ifdef __FreeBSD__ KASSERT(cur->timeout == PFTM_UNLINKED, ("pf_free_state: cur->timeout != PFTM_UNLINKED")); #else KASSERT(cur->timeout == PFTM_UNLINKED); #endif if (--cur->rule.ptr->states <= 0 && cur->rule.ptr->src_nodes <= 0) pf_rm_rule(NULL, cur->rule.ptr); if (cur->nat_rule.ptr != NULL) if (--cur->nat_rule.ptr->states <= 0 && cur->nat_rule.ptr->src_nodes <= 0) pf_rm_rule(NULL, cur->nat_rule.ptr); if (cur->anchor.ptr != NULL) if (--cur->anchor.ptr->states <= 0) pf_rm_rule(NULL, cur->anchor.ptr); pf_normalize_tcp_cleanup(cur); pfi_kif_unref(cur->u.s.kif, PFI_KIF_REF_STATE); TAILQ_REMOVE(&state_list, cur, u.s.entry_list); if (cur->tag) pf_tag_unref(cur->tag); pool_put(&pf_state_pl, cur); pf_status.fcounters[FCNT_STATE_REMOVALS]++; pf_status.states--; } void pf_purge_expired_states(u_int32_t maxcheck) { static struct pf_state *cur = NULL; struct pf_state *next; int locked = 0; while (maxcheck--) { /* wrap to start of list when we hit the end */ if (cur == NULL) { cur = TAILQ_FIRST(&state_list); if (cur == NULL) break; /* list empty */ } /* get 
next state, as cur may get deleted */ next = TAILQ_NEXT(cur, u.s.entry_list); if (cur->timeout == PFTM_UNLINKED) { /* free unlinked state */ if (! locked) { #ifdef __FreeBSD__ if (!sx_try_upgrade(&pf_consistency_lock)) { PF_UNLOCK(); sx_sunlock(&pf_consistency_lock); sx_xlock(&pf_consistency_lock); PF_LOCK(); } #else rw_enter_write(&pf_consistency_lock); #endif locked = 1; } pf_free_state(cur); } else if (pf_state_expires(cur) <= time_second) { /* unlink and free expired state */ pf_unlink_state(cur); if (! locked) { #ifdef __FreeBSD__ if (!sx_try_upgrade(&pf_consistency_lock)) { PF_UNLOCK(); sx_sunlock(&pf_consistency_lock); sx_xlock(&pf_consistency_lock); PF_LOCK(); } #else rw_enter_write(&pf_consistency_lock); #endif locked = 1; } pf_free_state(cur); } cur = next; } if (locked) #ifdef __FreeBSD__ sx_downgrade(&pf_consistency_lock); #else rw_exit_write(&pf_consistency_lock); #endif } int pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw) { if (aw->type != PF_ADDR_TABLE) return (0); if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL) return (1); return (0); } void pf_tbladdr_remove(struct pf_addr_wrap *aw) { if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL) return; pfr_detach_table(aw->p.tbl); aw->p.tbl = NULL; } void pf_tbladdr_copyout(struct pf_addr_wrap *aw) { struct pfr_ktable *kt = aw->p.tbl; if (aw->type != PF_ADDR_TABLE || kt == NULL) return; if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) kt = kt->pfrkt_root; aw->p.tbl = NULL; aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ? kt->pfrkt_cnt : -1; } void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { u_int32_t a = ntohl(addr->addr32[0]); printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, (a>>8)&255, a&255); if (p) { p = ntohs(p); printf(":%u", p); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { u_int16_t b; u_int8_t i, curstart = 255, curend = 0, maxstart = 0, maxend = 0; for (i = 0; i < 8; i++) { if (!addr->addr16[i]) { if (curstart == 255) curstart = i; else curend = i; } else { if (curstart) { if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; curstart = 255; } } } } for (i = 0; i < 8; i++) { if (i >= maxstart && i <= maxend) { if (maxend != 7) { if (i == maxstart) printf(":"); } else { if (i == maxend) printf(":"); } } else { b = ntohs(addr->addr16[i]); printf("%x", b); if (i < 7) printf(":"); } } if (p) { p = ntohs(p); printf("[%u]", p); } break; } #endif /* INET6 */ } } void pf_print_state(struct pf_state *s) { switch (s->proto) { case IPPROTO_TCP: printf("TCP "); break; case IPPROTO_UDP: printf("UDP "); break; case IPPROTO_ICMP: printf("ICMP "); break; case IPPROTO_ICMPV6: printf("ICMPV6 "); break; default: printf("%u ", s->proto); break; } pf_print_host(&s->lan.addr, s->lan.port, s->af); printf(" "); pf_print_host(&s->gwy.addr, s->gwy.port, s->af); printf(" "); pf_print_host(&s->ext.addr, s->ext.port, s->af); printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); printf("]"); printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); printf("]"); printf(" %u:%u", s->src.state, s->dst.state); } void pf_print_flags(u_int8_t f) { if (f) printf(" "); if (f & TH_FIN) printf("F"); if (f & TH_SYN) 
printf("S"); if (f & TH_RST) printf("R"); if (f & TH_PUSH) printf("P"); if (f & TH_ACK) printf("A"); if (f & TH_URG) printf("U"); if (f & TH_ECE) printf("E"); if (f & TH_CWR) printf("W"); } #define PF_SET_SKIP_STEPS(i) \ do { \ while (head[i] != cur) { \ head[i]->skip[i].ptr = cur; \ head[i] = TAILQ_NEXT(head[i], entries); \ } \ } while (0) void pf_calc_skip_steps(struct pf_rulequeue *rules) { struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; int i; cur = TAILQ_FIRST(rules); prev = cur; for (i = 0; i < PF_SKIP_COUNT; ++i) head[i] = cur; while (cur != NULL) { if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) PF_SET_SKIP_STEPS(PF_SKIP_IFP); if (cur->direction != prev->direction) PF_SET_SKIP_STEPS(PF_SKIP_DIR); if (cur->af != prev->af) PF_SET_SKIP_STEPS(PF_SKIP_AF); if (cur->proto != prev->proto) PF_SET_SKIP_STEPS(PF_SKIP_PROTO); if (cur->src.neg != prev->src.neg || pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); if (cur->src.port[0] != prev->src.port[0] || cur->src.port[1] != prev->src.port[1] || cur->src.port_op != prev->src.port_op) PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); if (cur->dst.port[0] != prev->dst.port[0] || cur->dst.port[1] != prev->dst.port[1] || cur->dst.port_op != prev->dst.port_op) PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); prev = cur; cur = TAILQ_NEXT(cur, entries); } for (i = 0; i < PF_SKIP_COUNT; ++i) PF_SET_SKIP_STEPS(i); } int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { if (aw1->type != aw2->type) return (1); switch (aw1->type) { case PF_ADDR_ADDRMASK: if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0)) return (1); if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0)) return (1); return (0); case PF_ADDR_DYNIFTL: return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (aw1->p.tbl != aw2->p.tbl); case PF_ADDR_RTLABEL: return (aw1->v.rtlabel != aw2->v.rtlabel); default: printf("invalid address type: %d\n", aw1->type); return (1); } } u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { u_int32_t l; if (udp && !cksum) return (0x0000); l = cksum + old - new; l = (l >> 16) + (l & 65535); l = l & 65535; if (udp && !l) return (0xFFFF); return (l); } void pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) { struct pf_addr ao; u_int16_t po = *p; PF_ACPY(&ao, a, af); PF_ACPY(a, an, af); *p = pn; switch (af) { #ifdef INET case AF_INET: *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), po, pn, u); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u), po, pn, u); break; #endif /* INET6 */ } } /* Changes a u_int32_t. 
Uses a void * so there are no align restrictions */ void pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), ao % 65536, an % 65536, u); } #ifdef INET6 void pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; PF_ACPY(&ao, a, AF_INET6); PF_ACPY(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*c, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); } #endif /* INET6 */ void pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) { struct pf_addr oia, ooa; PF_ACPY(&oia, ia, af); PF_ACPY(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { u_int16_t oip = *ip; u_int32_t opc = 0; /* make the compiler happy */ if (pc != NULL) opc = *pc; *ip = np; if (pc != NULL) *pc = pf_cksum_fixup(*pc, oip, *ip, u); *ic = pf_cksum_fixup(*ic, oip, *ip, 0); if (pc != NULL) *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ PF_ACPY(ia, na, af); switch (af) { #ifdef INET case AF_INET: { u_int32_t oh2c = *h2c; *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); break; } #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], u), oia.addr16[1], ia->addr16[1], u), oia.addr16[2], ia->addr16[2], u), oia.addr16[3], ia->addr16[3], u), oia.addr16[4], ia->addr16[4], u), oia.addr16[5], ia->addr16[5], u), oia.addr16[6], ia->addr16[6], u), oia.addr16[7], ia->addr16[7], u); break; #endif /* INET6 */ } /* Change outer ip address, fix outer ip or icmpv6 checksum. 
*/ PF_ACPY(oa, na, af); switch (af) { #ifdef INET case AF_INET: *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, ooa.addr16[0], oa->addr16[0], 0), ooa.addr16[1], oa->addr16[1], 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, ooa.addr16[0], oa->addr16[0], u), ooa.addr16[1], oa->addr16[1], u), ooa.addr16[2], oa->addr16[2], u), ooa.addr16[3], oa->addr16[3], u), ooa.addr16[4], oa->addr16[4], u), ooa.addr16[5], oa->addr16[5], u), ooa.addr16[6], oa->addr16[6], u), ooa.addr16[7], oa->addr16[7], u); break; #endif /* INET6 */ } } /* * Need to modulate the sequence numbers in the TCP SACK option * (credits to Krzysztof Pfaff for report and patch) */ int pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *dst) { int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; #ifdef __FreeBSD__ u_int8_t opts[TCP_MAXOLEN], *opt = opts; #else u_int8_t opts[MAX_TCPOPTLEN], *opt = opts; #endif int copyback = 0, i, olen; struct sackblk sack; #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) if (hlen < TCPOLEN_SACKLEN || !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) return 0; while (hlen >= TCPOLEN_SACKLEN) { olen = opt[1]; switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_SACK: if (olen > hlen) olen = hlen; if (olen >= TCPOLEN_SACKLEN) { for (i = 2; i + TCPOLEN_SACK <= olen; i += TCPOLEN_SACK) { memcpy(&sack, &opt[i], sizeof(sack)); pf_change_a(&sack.start, &th->th_sum, htonl(ntohl(sack.start) - dst->seqdiff), 0); pf_change_a(&sack.end, &th->th_sum, htonl(ntohl(sack.end) - dst->seqdiff), 0); memcpy(&opt[i], &sack, sizeof(sack)); } copyback = 1; } /* FALLTHROUGH */ default: if (olen < 2) olen = 2; hlen -= olen; opt += olen; } } if (copyback) #ifdef __FreeBSD__ m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); #else m_copyback(m, off + sizeof(*th), thoptlen, opts); #endif return (copyback); } void #ifdef __FreeBSD__ pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, #else pf_send_tcp(const struct pf_rule *r, sa_family_t af, #endif const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp) { struct mbuf *m; int len, tlen; #ifdef INET struct ip *h; #endif /* INET */ #ifdef INET6 struct ip6_hdr *h6; #endif /* INET6 */ struct tcphdr *th; char *opt; struct pf_mtag *pf_mtag; #ifdef __FreeBSD__ KASSERT( #ifdef INET af == AF_INET #else 0 #endif || #ifdef INET6 af == AF_INET6 #else 0 #endif , ("Unsupported AF %d", af)); len = 0; th = NULL; #ifdef INET h = NULL; #endif #ifdef INET6 h6 = NULL; #endif #endif /* maximum segment size tcp option */ tlen = sizeof(struct tcphdr); if (mss) tlen += 4; switch (af) { #ifdef INET case AF_INET: len = sizeof(struct ip) + tlen; break; #endif /* INET */ #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr) + tlen; break; #endif /* INET6 */ } /* create outgoing mbuf */ m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; #ifdef __FreeBSD__ #ifdef MAC if (replyto) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; #endif #endif if ((pf_mtag = pf_get_mtag(m)) == NULL) { m_freem(m); return; } if (tag) #ifdef __FreeBSD__ m->m_flags |= M_SKIP_FIREWALL; #else 
pf_mtag->flags |= PF_TAG_GENERATED; #endif pf_mtag->tag = rtag; if (r != NULL && r->rtableid >= 0) pf_mtag->rtableid = r->rtableid; #ifdef ALTQ if (r != NULL && r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->af = af; pf_mtag->hdr = mtod(m, struct ip *); } #endif /* ALTQ */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (af) { #ifdef INET case AF_INET: h = mtod(m, struct ip *); /* IP header fields included in the TCP checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(tlen); h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); /* IP header fields included in the TCP checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(tlen); memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); break; #endif /* INET6 */ } /* TCP header */ th->th_sport = sport; th->th_dport = dport; th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_off = tlen >> 2; th->th_flags = flags; th->th_win = htons(win); if (mss) { opt = (char *)(th + 1); opt[0] = TCPOPT_MAXSEG; opt[1] = 4; HTONS(mss); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); } switch (af) { #ifdef INET case AF_INET: /* TCP checksum */ th->th_sum = in_cksum(m, len); /* Finish the IP header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; #ifdef __FreeBSD__ h->ip_off = path_mtu_discovery ? IP_DF : 0; h->ip_len = len; #else h->ip_off = htons(ip_mtudisc ? IP_DF : 0); h->ip_len = htons(len); #endif h->ip_ttl = ttl ? ttl : ip_defttl; h->ip_sum = 0; if (eh == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); ip_output(m, (void *)NULL, (void *)NULL, 0, (void *)NULL, (void *)NULL); PF_LOCK(); #else /* ! __FreeBSD__ */ ip_output(m, (void *)NULL, (void *)NULL, 0, (void *)NULL, (void *)NULL); #endif } else { struct route ro; struct rtentry rt; struct ether_header *e = (void *)ro.ro_dst.sa_data; if (ifp == NULL) { m_freem(m); return; } rt.rt_ifp = ifp; ro.ro_rt = &rt; ro.ro_dst.sa_len = sizeof(ro.ro_dst); ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT; bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN); bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN); e->ether_type = eh->ether_type; #ifdef __FreeBSD__ PF_UNLOCK(); /* XXX_IMPORT: later */ ip_output(m, (void *)NULL, &ro, 0, (void *)NULL, (void *)NULL); PF_LOCK(); #else /* ! 
__FreeBSD__ */ ip_output(m, (void *)NULL, &ro, IP_ROUTETOETHER, (void *)NULL, (void *)NULL); #endif } break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* TCP checksum */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; #ifdef __FreeBSD__ PF_UNLOCK(); ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); PF_LOCK(); #else ip6_output(m, NULL, NULL, 0, NULL, NULL); #endif break; #endif /* INET6 */ } } void pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, struct pf_rule *r) { struct pf_mtag *pf_mtag; struct mbuf *m0; #ifdef __FreeBSD__ struct ip *ip; #endif #ifdef __FreeBSD__ m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) return; #else m0 = m_copy(m, 0, M_COPYALL); #endif if ((pf_mtag = pf_get_mtag(m0)) == NULL) return; #ifdef __FreeBSD__ /* XXX: revisit */ m0->m_flags |= M_SKIP_FIREWALL; #else pf_mtag->flags |= PF_TAG_GENERATED; #endif if (r->rtableid >= 0) pf_mtag->rtableid = r->rtableid; #ifdef ALTQ if (r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->af = af; pf_mtag->hdr = mtod(m0, struct ip *); } #endif /* ALTQ */ switch (af) { #ifdef INET case AF_INET: #ifdef __FreeBSD__ /* icmp_error() expects host byte ordering */ ip = mtod(m0, struct ip *); NTOHS(ip->ip_len); NTOHS(ip->ip_off); PF_UNLOCK(); icmp_error(m0, type, code, 0, 0); PF_LOCK(); #else icmp_error(m0, type, code, 0, 0); #endif break; #endif /* INET */ #ifdef INET6 case AF_INET6: #ifdef __FreeBSD__ PF_UNLOCK(); #endif icmp6_error(m0, type, code, 0); #ifdef __FreeBSD__ PF_LOCK(); #endif break; #endif /* INET6 */ } } /* * Return 1 if the addresses a and b match (with mask m), otherwise return 0. * If n is 0, they match if they are equal. If n is != 0, they match if they * are different. 
*/ int pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, struct pf_addr *b, sa_family_t af) { int match = 0; switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) match++; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) && ((a->addr32[1] & m->addr32[1]) == (b->addr32[1] & m->addr32[1])) && ((a->addr32[2] & m->addr32[2]) == (b->addr32[2] & m->addr32[2])) && ((a->addr32[3] & m->addr32[3]) == (b->addr32[3] & m->addr32[3]))) match++; break; #endif /* INET6 */ } if (match) { if (n) return (0); else return (1); } else { if (n) return (1); else return (0); } } int pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) { switch (op) { case PF_OP_IRG: return ((p > a1) && (p < a2)); case PF_OP_XRG: return ((p < a1) || (p > a2)); case PF_OP_RRG: return ((p >= a1) && (p <= a2)); case PF_OP_EQ: return (p == a1); case PF_OP_NE: return (p != a1); case PF_OP_LT: return (p < a1); case PF_OP_LE: return (p <= a1); case PF_OP_GT: return (p > a1); case PF_OP_GE: return (p >= a1); } return (0); /* never reached */ } int pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) { NTOHS(a1); NTOHS(a2); NTOHS(p); return (pf_match(op, a1, a2, p)); } int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } #ifndef __FreeBSD__ struct pf_mtag * pf_find_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) return (NULL); return ((struct pf_mtag *)(mtag + 1)); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) { mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof(struct pf_mtag)); m_tag_prepend(m, mtag); } return ((struct pf_mtag *)(mtag + 1)); } #endif int pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag, int *tag) { if (*tag == -1) *tag = pf_mtag->tag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } int pf_tag_packet(struct mbuf *m, struct pf_mtag *pf_mtag, int tag, int rtableid) { if (tag <= 0 && rtableid < 0) return (0); if (pf_mtag == NULL) if ((pf_mtag = pf_get_mtag(m)) == NULL) return (1); if (tag > 0) pf_mtag->tag = tag; if (rtableid >= 0) pf_mtag->rtableid = rtableid; return (0); } static void pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; (*r)->anchor->match = 0; if (match) *match = 0; if (*depth >= sizeof(pf_anchor_stack) / sizeof(pf_anchor_stack[0])) { printf("pf_step_into_anchor: stack overflow\n"); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = pf_anchor_stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { f->parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_anchor_node, f->parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->parent = NULL; f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); } int pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, 
struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; int quick = 0; do { if (*depth <= 0) break; f = pf_anchor_stack + *depth - 1; if (f->parent != NULL && f->child != NULL) { if (f->child->match || (match != NULL && *match)) { f->r->anchor->match = 1; *match = 0; } f->child = RB_NEXT(pf_anchor_node, f->parent, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (f->r->anchor->match || (match != NULL && *match)) quick = f->r->quick; *r = TAILQ_NEXT(f->r, entries); } while (*r == NULL); return (quick); } #ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; } } void pf_addr_inc(struct pf_addr *addr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; if (addr->addr32[2] == 0xffffffff) { addr->addr32[2] = 0; if (addr->addr32[1] == 0xffffffff) { addr->addr32[1] = 0; addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); } else addr->addr32[1] = htonl(ntohl(addr->addr32[1]) + 1); } else addr->addr32[2] = htonl(ntohl(addr->addr32[2]) + 1); } else addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; } } #endif /* INET6 */ #define mix(a,b,c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (0) /* * hash function based on bridge_hash in if_bridge.c */ void pf_hash(struct pf_addr *inaddr, struct pf_addr *hash, struct pf_poolhashkey *key, sa_family_t af) { u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0]; switch (af) { #ifdef INET case AF_INET: a += inaddr->addr32[0]; b += key->key32[1]; mix(a, b, c); hash->addr32[0] = c + key->key32[2]; break; #endif /* INET */ #ifdef INET6 case AF_INET6: a += inaddr->addr32[0]; b += inaddr->addr32[2]; mix(a, b, c); hash->addr32[0] = c; a += inaddr->addr32[1]; b += inaddr->addr32[3]; c += key->key32[1]; mix(a, b, c); hash->addr32[1] = c; a += inaddr->addr32[2]; b += inaddr->addr32[1]; c += key->key32[2]; mix(a, b, c); hash->addr32[2] = c; a += inaddr->addr32[3]; b += inaddr->addr32[0]; c += key->key32[3]; mix(a, b, c); hash->addr32[3] = c; break; #endif /* INET6 */ } } int pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn) { unsigned char hash[16]; struct pf_pool *rpool = &r->rpool; struct pf_addr *raddr = &rpool->cur->addr.v.a.addr; 
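	/*
	 * Summary comment (added for readability, derived from the code
	 * below): raddr/rmask default to the current pool entry's address
	 * and mask and are replaced further down for dynamic-interface
	 * (PF_ADDR_DYNIFTL) and table (PF_ADDR_TABLE) entries; the pool
	 * type (none, bitmask, random, source-hash, round-robin) then
	 * determines how the translation address naddr is chosen.
	 */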
struct pf_addr *rmask = &rpool->cur->addr.v.a.mask; struct pf_pooladdr *acur = rpool->cur; struct pf_src_node k; if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { k.af = af; PF_ACPY(&k.addr, saddr, af); if (r->rule_flag & PFRULE_RULESRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) k.rule.ptr = r; else k.rule.ptr = NULL; pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; *sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k); if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { PF_ACPY(naddr, &(*sn)->raddr, af); if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf_map_addr: src tracking maps "); pf_print_host(&k.addr, 0, af); printf(" to "); pf_print_host(naddr, 0, af); printf("\n"); } return (0); } } if (rpool->cur->addr.type == PF_ADDR_NOROUTE) return (1); if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { switch (af) { #ifdef INET case AF_INET: if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); raddr = &rpool->cur->addr.p.dyn->pfid_addr4; rmask = &rpool->cur->addr.p.dyn->pfid_mask4; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); raddr = &rpool->cur->addr.p.dyn->pfid_addr6; rmask = &rpool->cur->addr.p.dyn->pfid_mask6; break; #endif /* INET6 */ } } else if (rpool->cur->addr.type == PF_ADDR_TABLE) { if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); /* unsupported */ } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; } switch (rpool->opts & PF_POOL_TYPEMASK) { case PF_POOL_NONE: PF_ACPY(naddr, raddr, af); break; case PF_POOL_BITMASK: PF_POOLMASK(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: if (init_addr != NULL && PF_AZERO(init_addr, af)) { switch (af) { #ifdef INET case AF_INET: rpool->counter.addr32[0] = htonl(arc4random()); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (rmask->addr32[3] != 0xffffffff) rpool->counter.addr32[3] = htonl(arc4random()); else break; if (rmask->addr32[2] != 0xffffffff) rpool->counter.addr32[2] = htonl(arc4random()); else break; if (rmask->addr32[1] != 0xffffffff) rpool->counter.addr32[1] = htonl(arc4random()); else break; if (rmask->addr32[0] != 0xffffffff) rpool->counter.addr32[0] = htonl(arc4random()); break; #endif /* INET6 */ } PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); PF_ACPY(init_addr, naddr, af); } else { PF_AINC(&rpool->counter, af); PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); } break; case PF_POOL_SRCHASH: pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); break; case PF_POOL_ROUNDROBIN: if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (!pfr_pool_get(rpool->cur->addr.p.tbl, &rpool->tblidx, &rpool->counter, &raddr, &rmask, af)) goto get_addr; } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, &rpool->tblidx, &rpool->counter, &raddr, &rmask, af)) goto get_addr; } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) goto get_addr; try_next: if ((rpool->cur = TAILQ_NEXT(rpool->cur, entries)) == NULL) rpool->cur = TAILQ_FIRST(&rpool->list); if (rpool->cur->addr.type == PF_ADDR_TABLE) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.tbl, &rpool->tblidx, &rpool->counter, &raddr, &rmask, af)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; return (1); } } else 
if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, &rpool->tblidx, &rpool->counter, &raddr, &rmask, af)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; return (1); } } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; PF_ACPY(&rpool->counter, raddr, af); } get_addr: PF_ACPY(naddr, &rpool->counter, af); if (init_addr != NULL && PF_AZERO(init_addr, af)) PF_ACPY(init_addr, naddr, af); PF_AINC(&rpool->counter, af); break; } if (*sn != NULL) PF_ACPY(&(*sn)->raddr, naddr, af); if (pf_status.debug >= PF_DEBUG_MISC && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { printf("pf_map_addr: selected address "); pf_print_host(naddr, 0, af); printf("\n"); } return (0); } int pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport, struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high, struct pf_src_node **sn) { struct pf_state_cmp key; struct pf_addr init_addr; u_int16_t cut; bzero(&init_addr, sizeof(init_addr)); if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) return (1); if (proto == IPPROTO_ICMP) { low = 1; high = 65535; } do { key.af = af; key.proto = proto; PF_ACPY(&key.ext.addr, daddr, key.af); PF_ACPY(&key.gwy.addr, naddr, key.af); key.ext.port = dport; /* * port search; start random, step; * similar 2 portloop in in_pcbbind */ if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_ICMP)) { key.gwy.port = dport; if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) return (0); } else if (low == 0 && high == 0) { key.gwy.port = *nport; if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) return (0); } else if (low == high) { key.gwy.port = htons(low); if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) { *nport = htons(low); return (0); } } else { u_int16_t tmp; if (low > high) { tmp = low; low = high; high = tmp; } /* low < high */ cut = htonl(arc4random()) % (1 + high - low) + low; /* low <= cut <= high */ for (tmp = cut; tmp <= high; ++(tmp)) { key.gwy.port = htons(tmp); if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) { *nport = htons(tmp); return (0); } } for (tmp = cut - 1; tmp >= low; --(tmp)) { key.gwy.port = htons(tmp); if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) { *nport = htons(tmp); return (0); } } } switch (r->rpool.opts & PF_POOL_TYPEMASK) { case PF_POOL_RANDOM: case PF_POOL_ROUNDROBIN: if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) return (1); break; case PF_POOL_NONE: case PF_POOL_SRCHASH: case PF_POOL_BITMASK: default: return (1); } } while (! 
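/*
 * Editor's note (not part of the original pf.c diff): the surrounding
 * do/while is pf_get_sport()'s translation-port search.  For TCP, UDP and
 * ICMP with a real port range, a random starting point 'cut' is drawn from
 * [low, high]; ports are probed upward from cut to high and, failing that,
 * downward from cut-1 to low, and the first port for which no
 * { af, proto, ext, gwy } state already exists is written to *nport.  For
 * example, with low=50001, high=65535 and cut=61000 the probe order is
 * 61000..65535, then 60999..50001.  Only if every port on the current pool
 * address is busy does the outer loop pick another address (RANDOM and
 * ROUNDROBIN pools only) and try again, giving up once the addresses wrap
 * back around to init_addr.
 */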
PF_AEQ(&init_addr, naddr, af) ); return (1); /* none available */ } struct pf_rule * pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, struct pf_addr *daddr, u_int16_t dport, int rs_num) { struct pf_rule *r, *rm = NULL; struct pf_ruleset *ruleset = NULL; int tag = -1; int rtableid = -1; int asd = 0; r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr); while (r && rm == NULL) { struct pf_rule_addr *src = NULL, *dst = NULL; struct pf_addr_wrap *xdst = NULL; if (r->action == PF_BINAT && direction == PF_IN) { src = &r->dst; if (r->rpool.cur != NULL) xdst = &r->rpool.cur->addr; } else { src = &r->src; dst = &r->dst; } r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != pd->af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, src->neg, kif)) r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : PF_SKIP_DST_ADDR].ptr; else if (src->port_op && !pf_match_port(src->port_op, src->port[0], src->port[1], sport)) r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT : PF_SKIP_DST_PORT].ptr; else if (dst != NULL && PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, 0, NULL)) r = TAILQ_NEXT(r, entries); else if (dst != NULL && dst->port_op && !pf_match_port(dst->port_op, dst->port[0], dst->port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m, off, pd->hdr.tcp), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { rm = r; } else pf_step_into_anchor(&asd, &ruleset, rs_num, &r, NULL, NULL); } if (r == NULL) pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r, NULL, NULL); } if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) return (NULL); if (rm != NULL && (rm->action == PF_NONAT || rm->action == PF_NORDR || rm->action == PF_NOBINAT)) return (NULL); return (rm); } struct pf_rule * pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_src_node **sn, struct pf_addr *saddr, u_int16_t sport, struct pf_addr *daddr, u_int16_t dport, struct pf_addr *naddr, u_int16_t *nport) { struct pf_rule *r = NULL; if (direction == PF_OUT) { r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_BINAT); if (r == NULL) r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_NAT); } else { r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_RDR); if (r == NULL) r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_BINAT); } if (r != NULL) { switch (r->action) { case PF_NONAT: case PF_NOBINAT: case PF_NORDR: return (NULL); case PF_NAT: if (pf_get_sport(pd->af, pd->proto, r, saddr, daddr, dport, naddr, nport, r->rpool.proxy_port[0], r->rpool.proxy_port[1], sn)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation " "(%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); return 
(NULL); } break; case PF_BINAT: switch (direction) { case PF_OUT: if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){ switch (pd->af) { #ifdef INET case AF_INET: if (r->rpool.cur->addr.p.dyn-> pfid_acnt4 < 1) return (NULL); PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr4, &r->rpool.cur->addr.p.dyn-> pfid_mask4, saddr, AF_INET); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (r->rpool.cur->addr.p.dyn-> pfid_acnt6 < 1) return (NULL); PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr6, &r->rpool.cur->addr.p.dyn-> pfid_mask6, saddr, AF_INET6); break; #endif /* INET6 */ } } else PF_POOLMASK(naddr, &r->rpool.cur->addr.v.a.addr, &r->rpool.cur->addr.v.a.mask, saddr, pd->af); break; case PF_IN: if (r->src.addr.type == PF_ADDR_DYNIFTL) { switch (pd->af) { #ifdef INET case AF_INET: if (r->src.addr.p.dyn-> pfid_acnt4 < 1) return (NULL); PF_POOLMASK(naddr, &r->src.addr.p.dyn-> pfid_addr4, &r->src.addr.p.dyn-> pfid_mask4, daddr, AF_INET); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (r->src.addr.p.dyn-> pfid_acnt6 < 1) return (NULL); PF_POOLMASK(naddr, &r->src.addr.p.dyn-> pfid_addr6, &r->src.addr.p.dyn-> pfid_mask6, daddr, AF_INET6); break; #endif /* INET6 */ } } else PF_POOLMASK(naddr, &r->src.addr.v.a.addr, &r->src.addr.v.a.mask, daddr, pd->af); break; } break; case PF_RDR: { if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn)) return (NULL); if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask, daddr, pd->af); if (r->rpool.proxy_port[1]) { u_int32_t tmp_nport; tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) % (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] + 1)) + r->rpool.proxy_port[0]; /* wrap around if necessary */ if (tmp_nport > 65535) tmp_nport -= 65535; *nport = htons((u_int16_t)tmp_nport); } else if (r->rpool.proxy_port[0]) *nport = htons(r->rpool.proxy_port[0]); break; } default: return (NULL); } } return (r); } int #ifdef __FreeBSD__ pf_socket_lookup(int direction, struct pf_pdesc *pd, struct inpcb *inp_arg) #else pf_socket_lookup(int direction, struct pf_pdesc *pd) #endif { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; #ifdef __FreeBSD__ struct inpcbinfo *pi; #else struct inpcbtable *tb; #endif struct inpcb *inp; if (pd == NULL) return (-1); pd->lookup.uid = UID_MAX; pd->lookup.gid = GID_MAX; pd->lookup.pid = NO_PID; /* XXX: revisit */ #ifdef __FreeBSD__ if (inp_arg != NULL) { - INP_LOCK_ASSERT(inp_arg); + INP_WLOCK_ASSERT(inp_arg); if (inp_arg->inp_socket) { pd->lookup.uid = inp_arg->inp_socket->so_cred->cr_uid; pd->lookup.gid = inp_arg->inp_socket->so_cred->cr_groups[0]; return (1); } else return (-1); } #endif switch (pd->proto) { case IPPROTO_TCP: if (pd->hdr.tcp == NULL) return (-1); sport = pd->hdr.tcp->th_sport; dport = pd->hdr.tcp->th_dport; #ifdef __FreeBSD__ pi = &tcbinfo; #else tb = &tcbtable; #endif break; case IPPROTO_UDP: if (pd->hdr.udp == NULL) return (-1); sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; #ifdef __FreeBSD__ pi = &udbinfo; #else tb = &udbtable; #endif break; default: return (-1); } if (direction == PF_IN) { saddr = pd->src; daddr = pd->dst; } else { u_int16_t p; p = sport; sport = dport; dport = p; saddr = pd->dst; daddr = pd->src; } switch (pd->af) { #ifdef INET case AF_INET: #ifdef __FreeBSD__ INP_INFO_RLOCK(pi); /* XXX LOR */ inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport, 0, NULL); if (inp == NULL) { inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); if(inp == NULL) { 
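/*
 * Editor's note (not part of the original pf.c diff): pf_socket_lookup()
 * resolves the packet to a local socket in two passes: an exact 4-tuple pcb
 * lookup first, then a wildcard lookup that also matches listening sockets.
 * Reaching this point means neither pass found an inpcb, so there is no
 * local socket whose uid/gid could satisfy a "user"/"group" rule and the
 * lookup reports failure.
 */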
INP_INFO_RUNLOCK(pi); return (-1); } } #else inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport); if (inp == NULL) { inp = in_pcblookup_listen(tb, daddr->v4, dport, 0); if (inp == NULL) return (-1); } #endif break; #endif /* INET */ #ifdef INET6 case AF_INET6: #ifdef __FreeBSD__ INP_INFO_RLOCK(pi); inp = in6_pcblookup_hash(pi, &saddr->v6, sport, &daddr->v6, dport, 0, NULL); if (inp == NULL) { inp = in6_pcblookup_hash(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL); if (inp == NULL) { INP_INFO_RUNLOCK(pi); return (-1); } } #else inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6, dport); if (inp == NULL) { inp = in6_pcblookup_listen(tb, &daddr->v6, dport, 0); if (inp == NULL) return (-1); } #endif break; #endif /* INET6 */ default: return (-1); } #ifdef __FreeBSD__ - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_socket == NULL) || (inp->inp_socket->so_cred == NULL)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(pi); return (-1); } pd->lookup.uid = inp->inp_socket->so_cred->cr_uid; pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0]; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(pi); #else pd->lookup.uid = inp->inp_socket->so_euid; pd->lookup.gid = inp->inp_socket->so_egid; pd->lookup.pid = inp->inp_socket->so_cpid; #endif return (1); } u_int8_t pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int8_t wscale = 0; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= 3) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_WINDOW: wscale = opt[2]; if (wscale > TCP_MAX_WINSHIFT) wscale = TCP_MAX_WINSHIFT; wscale |= PF_WSCALE_FLAG; /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (wscale); } u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int16_t mss = tcp_mssdflt; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= TCPOLEN_MAXSEG) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_MAXSEG: bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); NTOHS(mss); /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (mss); } u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer) { #ifdef INET struct sockaddr_in *dst; struct route ro; #endif /* INET */ #ifdef INET6 struct sockaddr_in6 *dst6; struct route_in6 ro6; #endif /* INET6 */ struct rtentry *rt = NULL; int hlen = 0; /* make the compiler happy */ u_int16_t mss = tcp_mssdflt; switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; #ifdef __FreeBSD__ #ifdef RTF_PRCLONING rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING)); #else /* !RTF_PRCLONING */ rtalloc_ign(&ro, RTF_CLONING); #endif #else /* ! 
__FreeBSD__ */ rtalloc_noclone(&ro, NO_CLONING); #endif rt = ro.ro_rt; break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); bzero(&ro6, sizeof(ro6)); dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; #ifdef __FreeBSD__ #ifdef RTF_PRCLONING rtalloc_ign((struct route *)&ro6, (RTF_CLONING | RTF_PRCLONING)); #else /* !RTF_PRCLONING */ rtalloc_ign((struct route *)&ro6, RTF_CLONING); #endif #else /* ! __FreeBSD__ */ rtalloc_noclone((struct route *)&ro6, NO_CLONING); #endif rt = ro6.ro_rt; break; #endif /* INET6 */ } if (rt && rt->rt_ifp) { mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr); mss = max(tcp_mssdflt, mss); RTFREE(rt); } mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); } void pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr) { struct pf_rule *r = s->rule.ptr; s->rt_kif = NULL; if (!r->rt || r->rt == PF_FASTROUTE) return; switch (s->af) { #ifdef INET case AF_INET: pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &s->nat_src_node); s->rt_kif = r->rpool.cur->kif; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &s->nat_src_node); s->rt_kif = r->rpool.cur->kif; break; #endif /* INET6 */ } } int pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, #ifdef __FreeBSD__ struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq, struct inpcb *inp) #else struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq) #endif { struct pf_rule *nr = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; struct tcphdr *th = pd->hdr.tcp; u_int16_t bport, nport = 0; sa_family_t af = pd->af; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; u_short reason; int rewrite = 0; int tag = -1, rtableid = -1; u_int16_t mss = tcp_mssdflt; int asd = 0; int match = 0; if (pf_check_congestion(ifq)) { REASON_SET(&reason, PFRES_CONGEST); return (PF_DROP); } #ifdef __FreeBSD__ if (inp != NULL) pd->lookup.done = pf_socket_lookup(direction, pd, inp); else if (debug_pfugidhack) { PF_UNLOCK(); DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n")); pd->lookup.done = pf_socket_lookup(direction, pd, inp); PF_LOCK(); } #endif r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); if (direction == PF_OUT) { bport = nport = th->th_sport; /* check outgoing packet for BINAT/NAT */ if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, saddr, th->th_sport, daddr, th->th_dport, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, saddr, af); pf_change_ap(saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &pd->naddr, nport, 0, af); rewrite++; if (nr->natpass) r = NULL; pd->nat_rule = nr; } } else { bport = nport = th->th_dport; /* check incoming packet for BINAT/RDR */ if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, saddr, th->th_sport, daddr, th->th_dport, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, daddr, af); pf_change_ap(daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &pd->naddr, nport, 0, af); rewrite++; if (nr->natpass) r = NULL; pd->nat_rule = nr; } } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = 
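/*
 * Editor's note (not part of the original pf.c diff): the r->skip[]
 * pointers consulted throughout this rule loop are precomputed when the
 * ruleset is loaded.  Consecutive rules sharing the same interface,
 * direction, address family, protocol, address or port are chained
 * together, so a single mismatch lets the evaluation jump over the whole
 * run of rules that would fail for the same reason instead of testing them
 * one by one.
 */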
r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != IPPROTO_TCP) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], th->th_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], th->th_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if ((r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = #ifdef __FreeBSD__ pf_socket_lookup(direction, pd, inp), 1)) && #else pf_socket_lookup(direction, pd), 1)) && #endif !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = #ifdef __FreeBSD__ pf_socket_lookup(direction, pd, inp), 1)) && #else pf_socket_lookup(direction, pd), 1)) && #endif !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->natpass && nr->log)) { if (rewrite) #ifdef __FreeBSD__ m_copyback(m, off, sizeof(*th), (caddr_t)th); #else m_copyback(m, off, sizeof(*th), th); #endif PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? 
r : nr, a, ruleset, pd); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { /* undo NAT changes, if they have taken place */ if (nr != NULL) { if (direction == PF_OUT) { pf_change_ap(saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &pd->baddr, bport, 0, af); rewrite++; } else { pf_change_ap(daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &pd->baddr, bport, 0, af); rewrite++; } } if (((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURN)) && !(th->th_flags & TH_RST)) { u_int32_t ack = ntohl(th->th_seq) + pd->p_len; if (th->th_flags & TH_SYN) ack++; if (th->th_flags & TH_FIN) ack++; #ifdef __FreeBSD__ pf_send_tcp(m, r, af, pd->dst, #else pf_send_tcp(r, af, pd->dst, #endif pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp); } else if ((af == AF_INET) && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if ((af == AF_INET6) && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } if (r->action == PF_DROP) return (PF_DROP); if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } if (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM)) { /* create new state */ u_int16_t len; struct pf_state *s = NULL; struct pf_src_node *sn = NULL; len = pd->tot_len - off - (th->th_off << 2); /* check maximums */ if (r->max_states && (r->states >= r->max_states)) { pf_status.lcounters[LCNT_STATES]++; REASON_SET(&reason, PFRES_MAXSTATES); goto cleanup; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, saddr, af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && ((direction == PF_OUT && pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } s = pool_get(&pf_state_pl, PR_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); cleanup: if (sn != NULL && sn->states == 0 && sn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, sn); } if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, nsn); } return (PF_DROP); } bzero(s, sizeof(*s)); s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); s->allow_opts = r->allow_opts; s->log = r->log & PF_LOG_ALL; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; s->proto = IPPROTO_TCP; s->direction = direction; s->af = af; if (direction == PF_OUT) { PF_ACPY(&s->gwy.addr, saddr, af); s->gwy.port = th->th_sport; /* sport */ PF_ACPY(&s->ext.addr, daddr, af); s->ext.port = th->th_dport; if (nr != NULL) { PF_ACPY(&s->lan.addr, &pd->baddr, af); s->lan.port = bport; } else { PF_ACPY(&s->lan.addr, &s->gwy.addr, af); s->lan.port = s->gwy.port; } } else { PF_ACPY(&s->lan.addr, daddr, af); s->lan.port = th->th_dport; PF_ACPY(&s->ext.addr, saddr, af); s->ext.port = th->th_sport; if (nr != NULL) { PF_ACPY(&s->gwy.addr, &pd->baddr, af); s->gwy.port = bport; } else { PF_ACPY(&s->gwy.addr, &s->lan.addr, 
af); s->gwy.port = s->lan.port; } } s->src.seqlo = ntohl(th->th_seq); s->src.seqhi = s->src.seqlo + len + 1; if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_MODULATE) { /* Generate sequence number modulator */ #ifdef __FreeBSD__ while ((s->src.seqdiff = pf_new_isn(s) - s->src.seqlo) == 0) ; #else while ((s->src.seqdiff = tcp_rndiss_next() - s->src.seqlo) == 0) ; #endif pf_change_a(&th->th_seq, &th->th_sum, htonl(s->src.seqlo + s->src.seqdiff), 0); rewrite = 1; } else s->src.seqdiff = 0; if (th->th_flags & TH_SYN) { s->src.seqhi++; s->src.wscale = pf_get_wscale(m, off, th->th_off, af); } s->src.max_win = MAX(ntohs(th->th_win), 1); if (s->src.wscale & PF_WSCALE_MASK) { /* Remove scale factor from initial window */ int win = s->src.max_win; win += 1 << (s->src.wscale & PF_WSCALE_MASK); s->src.max_win = (win - 1) >> (s->src.wscale & PF_WSCALE_MASK); } if (th->th_flags & TH_FIN) s->src.seqhi++; s->dst.seqhi = 1; s->dst.max_win = 1; s->src.state = TCPS_SYN_SENT; s->dst.state = TCPS_CLOSED; s->creation = time_second; s->expire = time_second; s->timeout = PFTM_TCP_FIRST_PACKET; pf_set_rt_ifp(s, saddr); if (sn != NULL) { s->src_node = sn; s->src_node->states++; } if (nsn != NULL) { PF_ACPY(&nsn->raddr, &pd->naddr, af); s->nat_src_node = nsn; s->nat_src_node->states++; } if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { REASON_SET(&reason, PFRES_MEMORY); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, &s->src, &s->dst, &rewrite)) { /* This really shouldn't happen!!! */ DPFPRINTF(PF_DEBUG_URGENT, ("pf_normalize_tcp_stateful failed on first pkt")); pf_normalize_tcp_cleanup(s); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } if (pf_insert_state(BOUND_IFACE(r, kif), s)) { pf_normalize_tcp_cleanup(s); REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } else *sm = s; if (tag > 0) { pf_tag_ref(tag); s->tag = tag; } if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { s->src.state = PF_TCPS_PROXY_SRC; if (nr != NULL) { if (direction == PF_OUT) { pf_change_ap(saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &pd->baddr, bport, 0, af); } else { pf_change_ap(daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &pd->baddr, bport, 0, af); } } s->src.seqhi = htonl(arc4random()); /* Find mss option */ mss = pf_get_mss(m, off, th->th_off, af); mss = pf_calc_mss(saddr, af, mss); mss = pf_calc_mss(daddr, af, mss); s->src.mss = mss; #ifdef __FreeBSD__ pf_send_tcp(NULL, r, af, daddr, saddr, th->th_dport, #else pf_send_tcp(r, af, daddr, saddr, th->th_dport, #endif th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL); REASON_SET(&reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); } int pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, #ifdef __FreeBSD__ struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq, struct inpcb *inp) #else struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq) #endif { struct 
pf_rule *nr = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; struct udphdr *uh = pd->hdr.udp; u_int16_t bport, nport = 0; sa_family_t af = pd->af; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; u_short reason; int rewrite = 0; int tag = -1, rtableid = -1; int asd = 0; int match = 0; if (pf_check_congestion(ifq)) { REASON_SET(&reason, PFRES_CONGEST); return (PF_DROP); } #ifdef __FreeBSD__ if (inp != NULL) pd->lookup.done = pf_socket_lookup(direction, pd, inp); else if (debug_pfugidhack) { PF_UNLOCK(); DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n")); pd->lookup.done = pf_socket_lookup(direction, pd, inp); PF_LOCK(); } #endif r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); if (direction == PF_OUT) { bport = nport = uh->uh_sport; /* check outgoing packet for BINAT/NAT */ if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, saddr, uh->uh_sport, daddr, uh->uh_dport, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, saddr, af); pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &pd->naddr, nport, 1, af); rewrite++; if (nr->natpass) r = NULL; pd->nat_rule = nr; } } else { bport = nport = uh->uh_dport; /* check incoming packet for BINAT/RDR */ if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, saddr, uh->uh_sport, daddr, uh->uh_dport, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, daddr, af); pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &pd->naddr, nport, 1, af); rewrite++; if (nr->natpass) r = NULL; pd->nat_rule = nr; } } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != IPPROTO_UDP) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], uh->uh_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], uh->uh_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = #ifdef __FreeBSD__ pf_socket_lookup(direction, pd, inp), 1)) && #else pf_socket_lookup(direction, pd), 1)) && #endif !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = #ifdef __FreeBSD__ pf_socket_lookup(direction, pd, inp), 1)) && #else pf_socket_lookup(direction, pd), 1)) && #endif !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(&asd, 
&ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->natpass && nr->log)) { if (rewrite) #ifdef __FreeBSD__ m_copyback(m, off, sizeof(*uh), (caddr_t)uh); #else m_copyback(m, off, sizeof(*uh), uh); #endif PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { /* undo NAT changes, if they have taken place */ if (nr != NULL) { if (direction == PF_OUT) { pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &pd->baddr, bport, 1, af); rewrite++; } else { pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &pd->baddr, bport, 1, af); rewrite++; } } if ((af == AF_INET) && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if ((af == AF_INET6) && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } if (r->action == PF_DROP) return (PF_DROP); if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } if (r->keep_state || nr != NULL) { /* create new state */ struct pf_state *s = NULL; struct pf_src_node *sn = NULL; /* check maximums */ if (r->max_states && (r->states >= r->max_states)) { pf_status.lcounters[LCNT_STATES]++; REASON_SET(&reason, PFRES_MAXSTATES); goto cleanup; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, saddr, af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && ((direction == PF_OUT && pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } s = pool_get(&pf_state_pl, PR_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); cleanup: if (sn != NULL && sn->states == 0 && sn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, sn); } if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, nsn); } return (PF_DROP); } bzero(s, sizeof(*s)); s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); s->allow_opts = r->allow_opts; s->log = r->log & PF_LOG_ALL; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; s->proto = IPPROTO_UDP; s->direction = direction; s->af = af; if (direction == PF_OUT) { PF_ACPY(&s->gwy.addr, saddr, af); s->gwy.port = uh->uh_sport; PF_ACPY(&s->ext.addr, daddr, af); s->ext.port = uh->uh_dport; if (nr != NULL) { PF_ACPY(&s->lan.addr, &pd->baddr, af); s->lan.port = bport; } else { PF_ACPY(&s->lan.addr, &s->gwy.addr, af); s->lan.port = s->gwy.port; } } else { PF_ACPY(&s->lan.addr, daddr, af); s->lan.port = uh->uh_dport; PF_ACPY(&s->ext.addr, saddr, af); s->ext.port = uh->uh_sport; if (nr != NULL) { PF_ACPY(&s->gwy.addr, &pd->baddr, af); s->gwy.port = bport; } else { PF_ACPY(&s->gwy.addr, &s->lan.addr, af); s->gwy.port = s->lan.port; } } s->src.state = PFUDPS_SINGLE; s->dst.state = PFUDPS_NO_TRAFFIC; s->creation = time_second; 
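/*
 * Editor's note (not part of the original pf.c diff): the state endpoints
 * filled in above follow the same convention for every protocol: 'lan' is
 * the host on the internal side (pre-NAT source for outbound, post-RDR
 * destination for inbound), 'gwy' is that same endpoint as it appears on
 * the wire (post-NAT source, or the original destination an inbound packet
 * was addressed to), and 'ext' is the remote peer, which translation never
 * touches.  For an outbound UDP query 10.0.0.5:5000 -> 198.51.100.7:53
 * NATed to 192.0.2.1:61234 the state records lan=10.0.0.5:5000,
 * gwy=192.0.2.1:61234 and ext=198.51.100.7:53; with no translation rule,
 * lan and gwy are identical.
 */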
s->expire = time_second; s->timeout = PFTM_UDP_FIRST_PACKET; pf_set_rt_ifp(s, saddr); if (sn != NULL) { s->src_node = sn; s->src_node->states++; } if (nsn != NULL) { PF_ACPY(&nsn->raddr, &pd->naddr, af); s->nat_src_node = nsn; s->nat_src_node->states++; } if (pf_insert_state(BOUND_IFACE(r, kif), s)) { REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } else *sm = s; if (tag > 0) { pf_tag_ref(tag); s->tag = tag; } } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, sizeof(*uh), (caddr_t)uh); return (PF_PASS); } int pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq) { struct pf_rule *nr = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; u_short reason; u_int16_t icmpid = 0, bport, nport = 0; sa_family_t af = pd->af; u_int8_t icmptype = 0; /* make the compiler happy */ u_int8_t icmpcode = 0; /* make the compiler happy */ int state_icmp = 0; int tag = -1, rtableid = -1; #ifdef INET6 int rewrite = 0; #endif /* INET6 */ int asd = 0; int match = 0; if (pf_check_congestion(ifq)) { REASON_SET(&reason, PFRES_CONGEST); return (PF_DROP); } switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; icmpid = pd->hdr.icmp->icmp_id; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; icmpid = pd->hdr.icmp6->icmp6_id; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); if (direction == PF_OUT) { bport = nport = icmpid; /* check outgoing packet for BINAT/NAT */ if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, saddr, af); switch (af) { #ifdef INET case AF_INET: pf_change_a(&saddr->v4.s_addr, pd->ip_sum, pd->naddr.v4.s_addr, 0); pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, nport, 0); pd->hdr.icmp->icmp_id = nport; m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &pd->naddr, 0); rewrite++; break; #endif /* INET6 */ } if (nr->natpass) r = NULL; pd->nat_rule = nr; } } else { bport = nport = icmpid; /* check incoming packet for BINAT/RDR */ if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) != NULL) { PF_ACPY(&pd->baddr, daddr, af); switch (af) { #ifdef INET case AF_INET: pf_change_a(&daddr->v4.s_addr, pd->ip_sum, pd->naddr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &pd->naddr, 0); rewrite++; break; #endif /* INET6 */ } if (nr->natpass) r = NULL; pd->nat_rule = nr; } } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = 
r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->type && r->type != icmptype + 1) r = TAILQ_NEXT(r, entries); else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->natpass && nr->log)) { #ifdef INET6 if (rewrite) m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); #endif /* INET6 */ PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd); } if (r->action != PF_PASS) return (PF_DROP); if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } if (!state_icmp && (r->keep_state || nr != NULL)) { /* create new state */ struct pf_state *s = NULL; struct pf_src_node *sn = NULL; /* check maximums */ if (r->max_states && (r->states >= r->max_states)) { pf_status.lcounters[LCNT_STATES]++; REASON_SET(&reason, PFRES_MAXSTATES); goto cleanup; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, saddr, af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && ((direction == PF_OUT && pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } s = pool_get(&pf_state_pl, PR_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); cleanup: if (sn != NULL && sn->states == 0 && sn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, sn); } if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, nsn); } return (PF_DROP); } bzero(s, sizeof(*s)); s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); s->allow_opts = r->allow_opts; s->log = r->log & PF_LOG_ALL; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; s->proto = pd->proto; s->direction = direction; s->af = af; if (direction == PF_OUT) { PF_ACPY(&s->gwy.addr, saddr, af); s->gwy.port = nport; PF_ACPY(&s->ext.addr, daddr, af); 
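/*
 * Editor's note (not part of the original pf.c diff): ICMP has no port
 * pair, so the query/echo identifier stands in for the port on the lan/gwy
 * side of the state key and the remote side is keyed with port 0.  Only
 * query-style messages get this far: the "error" types (unreachable, time
 * exceeded, ...) set state_icmp earlier and never create state of their
 * own, since they are matched against the state of the packet quoted
 * inside them.
 */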
s->ext.port = 0; if (nr != NULL) { PF_ACPY(&s->lan.addr, &pd->baddr, af); s->lan.port = bport; } else { PF_ACPY(&s->lan.addr, &s->gwy.addr, af); s->lan.port = s->gwy.port; } } else { PF_ACPY(&s->lan.addr, daddr, af); s->lan.port = nport; PF_ACPY(&s->ext.addr, saddr, af); s->ext.port = 0; if (nr != NULL) { PF_ACPY(&s->gwy.addr, &pd->baddr, af); s->gwy.port = bport; } else { PF_ACPY(&s->gwy.addr, &s->lan.addr, af); s->gwy.port = s->lan.port; } } s->creation = time_second; s->expire = time_second; s->timeout = PFTM_ICMP_FIRST_PACKET; pf_set_rt_ifp(s, saddr); if (sn != NULL) { s->src_node = sn; s->src_node->states++; } if (nsn != NULL) { PF_ACPY(&nsn->raddr, &pd->naddr, af); s->nat_src_node = nsn; s->nat_src_node->states++; } if (pf_insert_state(BOUND_IFACE(r, kif), s)) { REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } else *sm = s; if (tag > 0) { pf_tag_ref(tag); s->tag = tag; } } #ifdef INET6 /* copy back packet headers if we performed IPv6 NAT operations */ if (rewrite) m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); #endif /* INET6 */ return (PF_PASS); } int pf_test_other(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq) { struct pf_rule *nr = NULL; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; sa_family_t af = pd->af; u_short reason; int tag = -1, rtableid = -1; int asd = 0; int match = 0; if (pf_check_congestion(ifq)) { REASON_SET(&reason, PFRES_CONGEST); return (PF_DROP); } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); if (direction == PF_OUT) { /* check outgoing packet for BINAT/NAT */ if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) { PF_ACPY(&pd->baddr, saddr, af); switch (af) { #ifdef INET case AF_INET: pf_change_a(&saddr->v4.s_addr, pd->ip_sum, pd->naddr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: PF_ACPY(saddr, &pd->naddr, af); break; #endif /* INET6 */ } if (nr->natpass) r = NULL; pd->nat_rule = nr; } } else { /* check incoming packet for BINAT/RDR */ if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) { PF_ACPY(&pd->baddr, daddr, af); switch (af) { #ifdef INET case AF_INET: pf_change_a(&daddr->v4.s_addr, pd->ip_sum, pd->naddr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: PF_ACPY(daddr, &pd->naddr, af); break; #endif /* INET6 */ } if (nr->natpass) r = NULL; pd->nat_rule = nr; } } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && 
!pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->natpass && nr->log)) PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd); if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { struct pf_addr *a = NULL; if (nr != NULL) { if (direction == PF_OUT) a = saddr; else a = daddr; } if (a != NULL) { switch (af) { #ifdef INET case AF_INET: pf_change_a(&a->v4.s_addr, pd->ip_sum, pd->baddr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: PF_ACPY(a, &pd->baddr, af); break; #endif /* INET6 */ } } if ((af == AF_INET) && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if ((af == AF_INET6) && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } if (r->action != PF_PASS) return (PF_DROP); if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } if (r->keep_state || nr != NULL) { /* create new state */ struct pf_state *s = NULL; struct pf_src_node *sn = NULL; /* check maximums */ if (r->max_states && (r->states >= r->max_states)) { pf_status.lcounters[LCNT_STATES]++; REASON_SET(&reason, PFRES_MAXSTATES); goto cleanup; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, saddr, af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && ((direction == PF_OUT && pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { REASON_SET(&reason, PFRES_SRCLIMIT); goto cleanup; } s = pool_get(&pf_state_pl, PR_NOWAIT); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); cleanup: if (sn != NULL && sn->states == 0 && sn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, sn); } if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; pf_status.src_nodes--; pool_put(&pf_src_tree_pl, nsn); } return (PF_DROP); } bzero(s, sizeof(*s)); s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); s->allow_opts = r->allow_opts; s->log = r->log & PF_LOG_ALL; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; s->proto = pd->proto; s->direction = direction; s->af = af; if (direction == PF_OUT) { PF_ACPY(&s->gwy.addr, saddr, af); PF_ACPY(&s->ext.addr, daddr, af); if (nr != NULL) PF_ACPY(&s->lan.addr, &pd->baddr, af); else PF_ACPY(&s->lan.addr, &s->gwy.addr, af); } else { PF_ACPY(&s->lan.addr, daddr, af); PF_ACPY(&s->ext.addr, saddr, af); if (nr != NULL) PF_ACPY(&s->gwy.addr, &pd->baddr, af); else PF_ACPY(&s->gwy.addr, &s->lan.addr, af); } s->src.state = PFOTHERS_SINGLE; s->dst.state = 
PFOTHERS_NO_TRAFFIC; s->creation = time_second; s->expire = time_second; s->timeout = PFTM_OTHER_FIRST_PACKET; pf_set_rt_ifp(s, saddr); if (sn != NULL) { s->src_node = sn; s->src_node->states++; } if (nsn != NULL) { PF_ACPY(&nsn->raddr, &pd->naddr, af); s->nat_src_node = nsn; s->nat_src_node->states++; } if (pf_insert_state(BOUND_IFACE(r, kif), s)) { REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); pool_put(&pf_state_pl, s); return (PF_DROP); } else *sm = s; if (tag > 0) { pf_tag_ref(tag); s->tag = tag; } } return (PF_PASS); } int pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm) { struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; sa_family_t af = pd->af; u_short reason; int tag = -1; int asd = 0; int match = 0; r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->src.port_op || r->dst.port_op || r->flagset)) r = TAILQ_NEXT(r, entries); else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else { if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log) PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset, pd); if (r->action != PF_PASS) return (PF_DROP); if (pf_tag_packet(m, pd->pf_mtag, tag, -1)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } return (PF_PASS); } int pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_state_cmp key; struct tcphdr *th = pd->hdr.tcp; u_int16_t win = ntohs(th->th_win); u_int32_t ack, end, seq, orig_seq; u_int8_t sws, dws; int ackskew; int copyback = 0; struct pf_state_peer *src, *dst; key.af = pd->af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); key.ext.port = th->th_sport; key.gwy.port = th->th_dport; } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); key.lan.port = th->th_sport; key.ext.port = th->th_dport; } STATE_LOOKUP(); if (direction == (*state)->direction) { src = 
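/*
 * Editor's note (not part of the original pf.c diff): 'src' and 'dst' are
 * picked relative to the direction the state was created in.  When this
 * packet travels the same way as the one that created the state, 'src' is
 * the state's own src peer; for a packet going the other way the roles are
 * swapped, so the sequence-tracking code below always updates the tracking
 * data of whichever endpoint actually sent the segment.
 */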
&(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } if ((*state)->src.state == PF_TCPS_PROXY_SRC) { if (direction != (*state)->direction) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } if (th->th_flags & TH_SYN) { if (ntohl(th->th_seq) != (*state)->src.seqlo) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } #ifdef __FreeBSD__ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, #else pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, #endif pd->src, th->th_dport, th->th_sport, (*state)->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (!(th->th_flags & TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } else (*state)->src.state = PF_TCPS_PROXY_DST; } if ((*state)->src.state == PF_TCPS_PROXY_DST) { struct pf_state_host *src, *dst; if (direction == PF_OUT) { src = &(*state)->gwy; dst = &(*state)->ext; } else { src = &(*state)->ext; dst = &(*state)->lan; } if (direction == (*state)->direction) { if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } (*state)->src.max_win = MAX(ntohs(th->th_win), 1); if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(arc4random()); #ifdef __FreeBSD__ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &src->addr, #else pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr, #endif &dst->addr, src->port, dst->port, (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (((th->th_flags & (TH_SYN|TH_ACK)) != (TH_SYN|TH_ACK)) || (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else { (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); (*state)->dst.seqlo = ntohl(th->th_seq); #ifdef __FreeBSD__ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, #else pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, #endif pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ntohl(th->th_seq) + 1, TH_ACK, (*state)->src.max_win, 0, 0, 0, (*state)->tag, NULL, NULL); #ifdef __FreeBSD__ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &src->addr, #else pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr, #endif &dst->addr, src->port, dst->port, (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL, NULL); (*state)->src.seqdiff = (*state)->dst.seqhi - (*state)->src.seqlo; (*state)->dst.seqdiff = (*state)->src.seqhi - (*state)->dst.seqlo; (*state)->src.seqhi = (*state)->src.seqlo + (*state)->dst.max_win; (*state)->dst.seqhi = (*state)->dst.seqlo + (*state)->src.max_win; (*state)->src.wscale = (*state)->dst.wscale = 0; (*state)->src.state = (*state)->dst.state = TCPS_ESTABLISHED; REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { sws = src->wscale & PF_WSCALE_MASK; dws = dst->wscale & PF_WSCALE_MASK; } else sws = dws = 0; /* * Sequence tracking algorithm from Guido van Rooij's paper: * http://www.madison-gurkha.com/publications/tcp_filtering/ * 
tcp_filtering.ps */ orig_seq = seq = ntohl(th->th_seq); if (src->seqlo == 0) { /* First packet from this end. Set its state */ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && src->scrub == NULL) { if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } } /* Deferred generation of sequence number modulator */ if (dst->seqdiff && !src->seqdiff) { #ifdef __FreeBSD__ while ((src->seqdiff = pf_new_isn(*state) - seq) == 0) ; #else while ((src->seqdiff = tcp_rndiss_next() - seq) == 0) ; #endif ack = ntohl(th->th_ack) - dst->seqdiff; pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); copyback = 1; } else { ack = ntohl(th->th_ack); } end = seq + pd->p_len; if (th->th_flags & TH_SYN) { end++; if (dst->wscale & PF_WSCALE_FLAG) { src->wscale = pf_get_wscale(m, off, th->th_off, pd->af); if (src->wscale & PF_WSCALE_FLAG) { /* Remove scale factor from initial * window */ sws = src->wscale & PF_WSCALE_MASK; win = ((u_int32_t)win + (1 << sws) - 1) >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; /* in case of a retrans SYN|ACK */ dst->wscale = 0; } } } if (th->th_flags & TH_FIN) end++; src->seqlo = seq; if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; /* * May need to slide the window (seqhi may have been set by * the crappy stack check or if we picked up the connection * after establishment) */ if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); if (win > src->max_win) src->max_win = win; } else { ack = ntohl(th->th_ack) - dst->seqdiff; if (src->seqdiff) { /* Modulate sequence numbers */ pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); copyback = 1; } end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; if (th->th_flags & TH_FIN) end++; } if ((th->th_flags & TH_ACK) == 0) { /* Let it pass through the ack skew check */ ack = dst->seqlo; } else if ((ack == 0 && (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || /* broken tcp stacks do not set ack */ (dst->state < TCPS_SYN_SENT)) { /* * Many stacks (ours included) will set the ACK number in an * FIN|ACK if the SYN times out -- no sequence to ACK. */ ack = dst->seqlo; } if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; end = seq; } ackskew = dst->seqlo - ack; /* * Need to demodulate the sequence numbers in any TCP SACK options * (Selective ACK). We could optionally validate the SACK values * against the current ACK window, either forwards or backwards, but * I'm not confident that SACK has been implemented properly * everywhere. It wouldn't surprise me if several stacks accidently * SACK too far backwards of previously ACKed data. There really aren't * any security implications of bad SACKing unless the target stack * doesn't validate the option length correctly. Someone trying to * spoof into a TCP connection won't bother blindly sending SACK * options anyway. 
*/ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { if (pf_modulate_sack(m, off, pd, th, dst)) copyback = 1; } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ (ackskew <= (MAXACKWINDOW << sws)) && /* Acking not more than one window forward */ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || (orig_seq == src->seqlo + 1) || (pd->flags & PFDESC_IP_REAS) == 0)) { /* Require an exact/+1 sequence match on resets when possible */ if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, ©back)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* update states */ if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) dst->state = TCPS_FIN_WAIT_2; } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_second; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; /* Fall through to PASS packet */ } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { /* Within a window backward of the originating packet */ /* * This currently handles three situations: * 1) Stupid stacks will shotgun SYNs before their peer * replies. * 2) When PF catches an already established stream (the * firewall rebooted, the state table was flushed, routes * changed...) * 3) Packets get funky immediately after the connection * closes (this should catch Solaris spurious ACK|FINs * that web servers like to spew after a close) * * This must be a little more careful than the above code * since packet floods will also be caught here. We don't * update the TTL here to mitigate the damage of a packet * flood and so the same code can handle awkward establishment * and a loosened connection close. * In the establishment case, a correct peer response will * validate the connection, go through the normal state code * and keep updating the state TTL. 
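 *
 * For scale: MAXACKWINDOW is 0xffff + 1500 = 67035, so this loose match
 * accepts any segment whose end lies at most ~64K plus the fudge factor
 * ahead of src->seqhi and whose seq lies no further than that behind
 * src->seqlo, and it skips the ackskew test that the strict path above
 * performs.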
*/ if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: loose state match: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu\n", seq, orig_seq, ack, pd->p_len, #ifdef __FreeBSD__ ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1]); #else ackskew, (*state)->packets[0], (*state)->packets[1]); #endif } if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, ©back)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* * Cannot set dst->seqhi here since this could be a shotgunned * SYN and not an already established connection. */ if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* Fall through to PASS packet */ } else { if ((*state)->dst.state == TCPS_SYN_SENT && (*state)->src.state == TCPS_SYN_SENT) { /* Send RST for state mismatches during handshake */ if (!(th->th_flags & TH_RST)) #ifdef __FreeBSD__ pf_send_tcp(m, (*state)->rule.ptr, pd->af, #else pf_send_tcp((*state)->rule.ptr, pd->af, #endif pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), 0, TH_RST, 0, 0, (*state)->rule.ptr->return_ttl, 1, 0, pd->eh, kif->pfik_ifp); src->seqlo = 0; src->seqhi = 1; src->max_win = 1; } else if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD state: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, #ifdef __FreeBSD__ (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], #else (*state)->packets[0], (*state)->packets[1], #endif direction == PF_IN ? "in" : "out", direction == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' 
' :'6'); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } /* Any packets which have gotten here are to be passed */ /* translate source/destination address, if necessary */ if (STATE_TRANSLATE(*state)) { if (direction == PF_OUT) pf_change_ap(pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &(*state)->gwy.addr, (*state)->gwy.port, 0, pd->af); else pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &(*state)->lan.addr, (*state)->lan.port, 0, pd->af); m_copyback(m, off, sizeof(*th), (caddr_t)th); } else if (copyback) { /* Copyback sequence modulation or stateful scrub changes */ m_copyback(m, off, sizeof(*th), (caddr_t)th); } return (PF_PASS); } int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_cmp key; struct udphdr *uh = pd->hdr.udp; key.af = pd->af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); key.ext.port = uh->uh_sport; key.gwy.port = uh->uh_dport; } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); key.lan.port = uh->uh_sport; key.ext.port = uh->uh_dport; } STATE_LOOKUP(); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFUDPS_SINGLE) src->state = PFUDPS_SINGLE; if (dst->state == PFUDPS_SINGLE) dst->state = PFUDPS_MULTIPLE; /* update expire time */ (*state)->expire = time_second; if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) (*state)->timeout = PFTM_UDP_MULTIPLE; else (*state)->timeout = PFTM_UDP_SINGLE; /* translate source/destination address, if necessary */ if (STATE_TRANSLATE(*state)) { if (direction == PF_OUT) pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &(*state)->gwy.addr, (*state)->gwy.port, 1, pd->af); else pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &(*state)->lan.addr, (*state)->lan.port, 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); } return (PF_PASS); } int pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0; /* make the compiler happy */ u_int16_t *icmpsum = NULL; /* make the compiler happy */ u_int8_t icmptype = 0; /* make the compiler happy */ int state_icmp = 0; struct pf_state_cmp key; switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } if (!state_icmp) { /* * ICMP query/reply message not related to a TCP/UDP packet. * Search for an ICMP state. 
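 *
 * The lookup key built below reuses the port fields to carry the ICMP
 * id, so a query and its reply match the same state.  Roughly, for the
 * inbound case:
 *
 *	key.af       = pd->af;
 *	key.proto    = pd->proto;
 *	key.ext.addr = *pd->src;	key.ext.port = 0;
 *	key.gwy.addr = *pd->dst;	key.gwy.port = icmpid;
 *
 * Only the id takes part in the match; the ICMP sequence number does
 * not.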
*/ key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); key.ext.port = 0; key.gwy.port = icmpid; } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); key.lan.port = icmpid; key.ext.port = 0; } STATE_LOOKUP(); (*state)->expire = time_second; (*state)->timeout = PFTM_ICMP_ERROR_REPLY; /* translate source/destination address, if necessary */ if (STATE_TRANSLATE(*state)) { if (direction == PF_OUT) { switch (pd->af) { #ifdef INET case AF_INET: pf_change_a(&saddr->v4.s_addr, pd->ip_sum, (*state)->gwy.addr.v4.s_addr, 0); pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, (*state)->gwy.port, 0); pd->hdr.icmp->icmp_id = (*state)->gwy.port; m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &(*state)->gwy.addr, 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); break; #endif /* INET6 */ } } else { switch (pd->af) { #ifdef INET case AF_INET: pf_change_a(&daddr->v4.s_addr, pd->ip_sum, (*state)->lan.addr.v4.s_addr, 0); pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, (*state)->lan.port, 0); pd->hdr.icmp->icmp_id = (*state)->lan.port; m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &(*state)->lan.addr, 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); break; #endif /* INET6 */ } } } return (PF_PASS); } else { /* * ICMP error message in response to a TCP/UDP packet. * Extract the inner TCP/UDP header and search for that state. 
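 *
 * For the IPv4 case the error packet looks like this, with the offsets
 * computed below (all measured from the start of the mbuf chain):
 *
 *	[outer IP][ICMP at off][inner IP (h2) at ipoff2][TCP/UDP at off2]
 *
 *	ipoff2 = off + ICMP_MINLEN;
 *	off2   = ipoff2 + (h2.ip_hl << 2);
 *
 * The quoted inner packet travelled in the opposite direction, so its
 * addresses appear swapped relative to the state; that is why the key
 * further down is built from pd2.dst/pd2.src rather than pd2.src/pd2.dst.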
*/ struct pf_pdesc pd2; #ifdef INET struct ip h2; #endif /* INET */ #ifdef INET6 struct ip6_hdr h2_6; int terminal = 0; #endif /* INET6 */ int ipoff2 = 0; /* make the compiler happy */ int off2 = 0; /* make the compiler happy */ pd2.af = pd->af; switch (pd->af) { #ifdef INET case AF_INET: /* offset of h2 in mbuf chain */ ipoff2 = off + ICMP_MINLEN; if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip)\n")); return (PF_DROP); } /* * ICMP error messages don't refer to non-first * fragments */ if (h2.ip_off & htons(IP_OFFMASK)) { REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* offset of protocol header that follows h2 */ off2 = ipoff2 + (h2.ip_hl << 2); pd2.proto = h2.ip_p; pd2.src = (struct pf_addr *)&h2.ip_src; pd2.dst = (struct pf_addr *)&h2.ip_dst; pd2.ip_sum = &h2.ip_sum; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipoff2 = off + sizeof(struct icmp6_hdr); if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip6)\n")); return (PF_DROP); } pd2.proto = h2_6.ip6_nxt; pd2.src = (struct pf_addr *)&h2_6.ip6_src; pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; pd2.ip_sum = NULL; off2 = ipoff2 + sizeof(h2_6); do { switch (pd2.proto) { case IPPROTO_FRAGMENT: /* * ICMPv6 error messages for * non-first fragments */ REASON_SET(reason, PFRES_FRAG); return (PF_DROP); case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off2, &opt6, sizeof(opt6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMPv6 short opt\n")); return (PF_DROP); } if (pd2.proto == IPPROTO_AH) off2 += (opt6.ip6e_len + 2) * 4; else off2 += (opt6.ip6e_len + 1) * 8; pd2.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); break; #endif /* INET6 */ #ifdef __FreeBSD__ default: panic("AF not supported: %d", pd->af); #endif } switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; u_int32_t seq; struct pf_state_peer *src, *dst; u_int8_t dws; int copyback = 0; /* * Only the first 8 bytes of the TCP header can be * expected. Don't access any TCP header fields after * th_seq, an ackskew test is not possible. 
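 *
 * Eight bytes cover exactly th_sport (2), th_dport (2) and th_seq (4);
 * th_ack, th_off and th_flags may not be present in the quoted packet,
 * which is why only the sequence-window check is applied below.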
*/ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(tcp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); key.ext.port = th.th_dport; key.gwy.port = th.th_sport; } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); key.lan.port = th.th_dport; key.ext.port = th.th_sport; } STATE_LOOKUP(); if (direction == (*state)->direction) { src = &(*state)->dst; dst = &(*state)->src; } else { src = &(*state)->src; dst = &(*state)->dst; } if (src->wscale && dst->wscale) dws = dst->wscale & PF_WSCALE_MASK; else dws = 0; /* Demodulate sequence number */ seq = ntohl(th.th_seq) - src->seqdiff; if (src->seqdiff) { pf_change_a(&th.th_seq, icmpsum, htonl(seq), 0); copyback = 1; } if (!SEQ_GEQ(src->seqhi, seq) || !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) { if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, pd->hdr.icmp->icmp_code); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } if (STATE_TRANSLATE(*state)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, &th.th_sport, daddr, &(*state)->lan.addr, (*state)->lan.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } else { pf_change_icmp(pd2.dst, &th.th_dport, saddr, &(*state)->gwy.addr, (*state)->gwy.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } copyback = 1; } if (copyback) { switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, 8, (caddr_t)&th); } return (PF_PASS); break; } case IPPROTO_UDP: { struct udphdr uh; if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(udp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); key.ext.port = uh.uh_dport; key.gwy.port = uh.uh_sport; } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); key.lan.port = uh.uh_dport; key.ext.port = uh.uh_sport; } STATE_LOOKUP(); if (STATE_TRANSLATE(*state)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &(*state)->lan.addr, (*state)->lan.port, &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); } else { pf_change_icmp(pd2.dst, &uh.uh_dport, saddr, &(*state)->gwy.addr, (*state)->gwy.port, &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); } switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, sizeof(uh), (caddr_t)&uh); } return (PF_PASS); break; } #ifdef INET case IPPROTO_ICMP: { struct icmp iih; if (!pf_pull_hdr(m, off2, &iih, 
ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short i" "(icmp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); key.ext.port = 0; key.gwy.port = iih.icmp_id; } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); key.lan.port = iih.icmp_id; key.ext.port = 0; } STATE_LOOKUP(); if (STATE_TRANSLATE(*state)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &(*state)->lan.addr, (*state)->lan.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); } else { pf_change_icmp(pd2.dst, &iih.icmp_id, saddr, &(*state)->gwy.addr, (*state)->gwy.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); } m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: { struct icmp6_hdr iih; if (!pf_pull_hdr(m, off2, &iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp6)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMPV6; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); key.ext.port = 0; key.gwy.port = iih.icmp6_id; } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); key.lan.port = iih.icmp6_id; key.ext.port = 0; } STATE_LOOKUP(); if (STATE_TRANSLATE(*state)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &(*state)->lan.addr, (*state)->lan.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); } else { pf_change_icmp(pd2.dst, &iih.icmp6_id, saddr, &(*state)->gwy.addr, (*state)->gwy.port, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); } m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); m_copyback(m, off2, sizeof(struct icmp6_hdr), (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET6 */ default: { key.af = pd2.af; key.proto = pd2.proto; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); key.ext.port = 0; key.gwy.port = 0; } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); key.lan.port = 0; key.ext.port = 0; } STATE_LOOKUP(); if (STATE_TRANSLATE(*state)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, NULL, daddr, &(*state)->lan.addr, 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } else { pf_change_icmp(pd2.dst, NULL, saddr, &(*state)->gwy.addr, 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); break; #endif /* INET6 */ } } return (PF_PASS); break; } } } } int pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_cmp key; key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); key.ext.port 
= 0; key.gwy.port = 0; } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); key.lan.port = 0; key.ext.port = 0; } STATE_LOOKUP(); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFOTHERS_SINGLE) src->state = PFOTHERS_SINGLE; if (dst->state == PFOTHERS_SINGLE) dst->state = PFOTHERS_MULTIPLE; /* update expire time */ (*state)->expire = time_second; if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) (*state)->timeout = PFTM_OTHER_MULTIPLE; else (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ if (STATE_TRANSLATE(*state)) { if (direction == PF_OUT) switch (pd->af) { #ifdef INET case AF_INET: pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, (*state)->gwy.addr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: PF_ACPY(pd->src, &(*state)->gwy.addr, pd->af); break; #endif /* INET6 */ } else switch (pd->af) { #ifdef INET case AF_INET: pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, (*state)->lan.addr.v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: PF_ACPY(pd->dst, &(*state)->lan.addr, pd->af); break; #endif /* INET6 */ } } return (PF_PASS); } /* * ipoff and off are measured from the start of the mbuf chain. * h must be at "ipoff" on the mbuf chain. */ void * pf_pull_hdr(struct mbuf *m, int off, void *p, int len, u_short *actionp, u_short *reasonp, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; if (fragoff) { if (fragoff >= len) ACTION_SET(actionp, PF_PASS); else { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_FRAG); } return (NULL); } if (m->m_pkthdr.len < off + len || ntohs(h->ip_len) < off + len) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (m->m_pkthdr.len < off + len || (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < (unsigned)(off + len)) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET6 */ } m_copydata(m, off, len, p); return (p); } int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif) { struct sockaddr_in *dst; int ret = 1; int check_mpath; #ifndef __FreeBSD__ extern int ipmultipath; #endif #ifdef INET6 #ifndef __FreeBSD__ extern int ip6_multipath; #endif struct sockaddr_in6 *dst6; struct route_in6 ro; #else struct route ro; #endif struct radix_node *rn; struct rtentry *rt; struct ifnet *ifp; check_mpath = 0; bzero(&ro, sizeof(ro)); switch (af) { case AF_INET: dst = satosin(&ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; #ifndef __FreeBSD__ /* MULTIPATH_ROUTING */ if (ipmultipath) check_mpath = 1; #endif break; #ifdef INET6 case AF_INET6: dst6 = (struct sockaddr_in6 *)&ro.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; #ifndef __FreeBSD__ /* MULTIPATH_ROUTING */ if (ip6_multipath) check_mpath = 1; #endif break; #endif /* INET6 */ default: return (0); } /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) goto out; #ifdef __FreeBSD__ rtalloc_ign((struct route *)&ro, RTF_CLONING); #else /* ! 
__FreeBSD__ */ rtalloc_noclone((struct route *)&ro, NO_CLONING); #endif if (ro.ro_rt != NULL) { /* No interface given, this is a no-route check */ if (kif == NULL) goto out; if (kif->pfik_ifp == NULL) { ret = 0; goto out; } /* Perform uRPF check if passed input interface */ ret = 0; rn = (struct radix_node *)ro.ro_rt; do { rt = (struct rtentry *)rn; #ifndef __FreeBSD__ /* CARPDEV */ if (rt->rt_ifp->if_type == IFT_CARP) ifp = rt->rt_ifp->if_carpdev; else #endif ifp = rt->rt_ifp; if (kif->pfik_ifp == ifp) ret = 1; #ifdef __FreeBSD__ /* MULTIPATH_ROUTING */ rn = NULL; #else rn = rn_mpath_next(rn); #endif } while (check_mpath == 1 && rn != NULL && ret == 0); } else ret = 0; out: if (ro.ro_rt != NULL) RTFREE(ro.ro_rt); return (ret); } int pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw) { struct sockaddr_in *dst; #ifdef INET6 struct sockaddr_in6 *dst6; struct route_in6 ro; #else struct route ro; #endif int ret = 0; bzero(&ro, sizeof(ro)); switch (af) { case AF_INET: dst = satosin(&ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; break; #ifdef INET6 case AF_INET6: dst6 = (struct sockaddr_in6 *)&ro.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; break; #endif /* INET6 */ default: return (0); } #ifdef __FreeBSD__ # ifdef RTF_PRCLONING rtalloc_ign((struct route *)&ro, (RTF_CLONING|RTF_PRCLONING)); # else /* !RTF_PRCLONING */ rtalloc_ign((struct route *)&ro, RTF_CLONING); # endif #else /* ! __FreeBSD__ */ rtalloc_noclone((struct route *)&ro, NO_CLONING); #endif if (ro.ro_rt != NULL) { #ifdef __FreeBSD__ /* XXX_IMPORT: later */ #else if (ro.ro_rt->rt_labelid == aw->v.rtlabel) ret = 1; #endif RTFREE(ro.ro_rt); } return (ret); } #ifdef INET void pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd) { struct mbuf *m0, *m1; struct route iproute; struct route *ro = NULL; struct sockaddr_in *dst; struct ip *ip; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; int error = 0; #ifdef __FreeBSD__ int sw_csum; #endif #ifdef IPSEC struct m_tag *mtag; #endif /* IPSEC */ if (m == NULL || *m == NULL || r == NULL || (dir != PF_IN && dir != PF_OUT) || oifp == NULL) panic("pf_route: invalid parameters"); if (pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad; } if (r->rt == PF_DUPTO) { #ifdef __FreeBSD__ if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL) #else if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL) #endif return; } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) return; m0 = *m; } if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: m0->m_len < sizeof(struct ip)\n")); goto bad; } ip = mtod(m0, struct ip *); ro = &iproute; bzero((caddr_t)ro, sizeof(*ro)); dst = satosin(&ro->ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; if (r->rt == PF_FASTROUTE) { rtalloc(ro); if (ro->ro_rt == 0) { ipstat.ips_noroute++; goto bad; } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = satosin(ro->ro_rt->rt_gateway); } else { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n")); goto bad; } if (s == NULL) { pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET)) dst->sin_addr.s_addr = naddr.v4.s_addr; ifp = r->rpool.cur->kif ? 
r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET)) dst->sin_addr.s_addr = s->rt_addr.v4.s_addr; ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; } } if (ifp == NULL) goto bad; if (oifp != ifp) { #ifdef __FreeBSD__ PF_UNLOCK(); if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { PF_LOCK(); goto bad; } else if (m0 == NULL) { PF_LOCK(); goto done; } PF_LOCK(); #else if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; #endif if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: m0->m_len < sizeof(struct ip)\n")); goto bad; } ip = mtod(m0, struct ip *); } #ifdef __FreeBSD__ /* Copied from FreeBSD 5.1-CURRENT ip_output. */ m0->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { /* * XXX: in_delayed_cksum assumes HBO for ip->ip_len (at least) */ NTOHS(ip->ip_len); NTOHS(ip->ip_off); /* XXX: needed? */ in_delayed_cksum(m0); HTONS(ip->ip_len); HTONS(ip->ip_off); sw_csum &= ~CSUM_DELAY_DATA; } m0->m_pkthdr.csum_flags &= ifp->if_hwassist; if (ntohs(ip->ip_len) <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && ((ip->ip_off & htons(IP_DF)) == 0))) { /* * ip->ip_len = htons(ip->ip_len); * ip->ip_off = htons(ip->ip_off); */ ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { /* From KAME */ if (ip->ip_v == IPVERSION && (ip->ip_hl << 2) == sizeof(*ip)) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); } } PF_UNLOCK(); error = (*ifp->if_output)(ifp, m0, sintosa(dst), ro->ro_rt); PF_LOCK(); goto done; } #else /* Copied from ip_output. */ #ifdef IPSEC /* * If deferred crypto processing is needed, check that the * interface supports it. */ if ((mtag = m_tag_find(m0, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL)) != NULL && (ifp->if_capabilities & IFCAP_IPSEC) == 0) { /* Notify IPsec to do its own crypto. */ ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); goto bad; } #endif /* IPSEC */ /* Catch routing changes wrt. hardware checksumming for TCP or UDP. */ if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT) { if (!(ifp->if_capabilities & IFCAP_CSUM_TCPv4) || ifp->if_bridge != NULL) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~M_TCPV4_CSUM_OUT; /* Clear */ } } else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT) { if (!(ifp->if_capabilities & IFCAP_CSUM_UDPv4) || ifp->if_bridge != NULL) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~M_UDPV4_CSUM_OUT; /* Clear */ } } if (ntohs(ip->ip_len) <= ifp->if_mtu) { if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) && ifp->if_bridge == NULL) { m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; ipstat.ips_outhwcsum++; } else { ip->ip_sum = 0; ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); } /* Update relevant hardware checksum stats for TCP/UDP */ if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT) tcpstat.tcps_outhwcsum++; else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT) udpstat.udps_outhwcsum++; error = (*ifp->if_output)(ifp, m0, sintosa(dst), NULL); goto done; } #endif /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. 
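 *
 * When IP_DF is set the packet cannot be fragmented; the code below then
 * bumps ips_cantfrag and answers with ICMP_UNREACH_NEEDFRAG carrying
 * ifp->if_mtu (PF_DUPTO copies are just dropped), which presumably keeps
 * path MTU discovery working across a route-to rule.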
*/ if (ip->ip_off & htons(IP_DF)) { ipstat.ips_cantfrag++; if (r->rt != PF_DUPTO) { #ifdef __FreeBSD__ /* icmp_error() expects host byte ordering */ NTOHS(ip->ip_len); NTOHS(ip->ip_off); PF_UNLOCK(); icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); PF_LOCK(); #else icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); #endif goto done; } else goto bad; } m1 = m0; #ifdef __FreeBSD__ /* * XXX: is cheaper + less error prone than own function */ NTOHS(ip->ip_len); NTOHS(ip->ip_off); error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum); #else error = ip_fragment(m0, ifp, ifp->if_mtu); #endif if (error) { #ifndef __FreeBSD__ /* ip_fragment does not do m_freem() on FreeBSD */ m0 = NULL; #endif goto bad; } for (m0 = m1; m0; m0 = m1) { m1 = m0->m_nextpkt; m0->m_nextpkt = 0; #ifdef __FreeBSD__ if (error == 0) { PF_UNLOCK(); error = (*ifp->if_output)(ifp, m0, sintosa(dst), NULL); PF_LOCK(); } else #else if (error == 0) error = (*ifp->if_output)(ifp, m0, sintosa(dst), NULL); else #endif m_freem(m0); } if (error == 0) ipstat.ips_fragmented++; done: if (r->rt != PF_DUPTO) *m = NULL; if (ro == &iproute && ro->ro_rt) RTFREE(ro->ro_rt); return; bad: m_freem(m0); goto done; } #endif /* INET */ #ifdef INET6 void pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd) { struct mbuf *m0; struct route_in6 ip6route; struct route_in6 *ro; struct sockaddr_in6 *dst; struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; int error = 0; if (m == NULL || *m == NULL || r == NULL || (dir != PF_IN && dir != PF_OUT) || oifp == NULL) panic("pf_route6: invalid parameters"); if (pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad; } if (r->rt == PF_DUPTO) { #ifdef __FreeBSD__ if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL) #else if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL) #endif return; } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) return; m0 = *m; } if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n")); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); dst = (struct sockaddr_in6 *)&ro->ro_dst; dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = ip6->ip6_dst; /* Cheat. XXX why only in the v6 case??? */ if (r->rt == PF_FASTROUTE) { #ifdef __FreeBSD__ m0->m_flags |= M_SKIP_FIREWALL; PF_UNLOCK(); ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); PF_LOCK(); #else mtag = m_tag_get(PACKET_TAG_PF_GENERATED, 0, M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m0, mtag); pd->pf_mtag->flags |= PF_TAG_GENERATED; ip6_output(m0, NULL, NULL, 0, NULL, NULL); #endif return; } if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n")); goto bad; } if (s == NULL) { pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst->sin6_addr, &naddr, AF_INET6); ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst->sin6_addr, &s->rt_addr, AF_INET6); ifp = s->rt_kif ? 
s->rt_kif->pfik_ifp : NULL; } if (ifp == NULL) goto bad; if (oifp != ifp) { #ifdef __FreeBSD__ PF_UNLOCK(); if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { PF_LOCK(); goto bad; } else if (m0 == NULL) { PF_LOCK(); goto done; } PF_LOCK(); #else if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; #endif if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n")); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); } /* * If the packet is too large for the outgoing interface, * send back an icmp6 error. */ if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr)) dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif error = nd6_output(ifp, ifp, m0, dst, NULL); #ifdef __FreeBSD__ PF_LOCK(); #endif } else { in6_ifstat_inc(ifp, ifs6_in_toobig); #ifdef __FreeBSD__ if (r->rt != PF_DUPTO) { PF_UNLOCK(); icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); PF_LOCK(); } else #else if (r->rt != PF_DUPTO) icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); else #endif goto bad; } done: if (r->rt != PF_DUPTO) *m = NULL; return; bad: m_freem(m0); goto done; } #endif /* INET6 */ #ifdef __FreeBSD__ /* * FreeBSD supports cksum offloads for the following drivers. * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4), * ti(4), txp(4), xl(4) * * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : * network driver performed cksum including pseudo header, need to verify * csum_data * CSUM_DATA_VALID : * network driver performed cksum, needs to additional pseudo header * cksum computation with partial csum_data(i.e. lack of H/W support for * pseudo header, for instance hme(4), sk(4) and possibly gem(4)) * * After validating the cksum of packet, set both flag CSUM_DATA_VALID and * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper * TCP/UDP layer. * Also, set csum_data to 0xffff to force cksum validation. 
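 *
 * A sketch of the partial-checksum completion done below when only
 * CSUM_DATA_VALID is set (csum_data then holds the sum over the data
 * without the pseudo header), taken from the TCP branch:
 *
 *	sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 *	    htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP));
 *	sum ^= 0xffff;		(a result of 0 means the checksum verified)
 *
 * The UDP branch is identical except for IPPROTO_UDP.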
*/ int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t sum = 0; int hw_assist = 0; struct ip *ip; if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (p) { case IPPROTO_TCP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_UDP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ break; default: return (1); } if (!hw_assist) { switch (af) { case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } } if (sum) { switch (p) { case IPPROTO_TCP: tcpstat.tcps_rcvbadsum++; break; case IPPROTO_UDP: udpstat.udps_badsum++; break; case IPPROTO_ICMP: icmpstat.icps_checksum++; break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6stat.icp6s_checksum++; break; #endif /* INET6 */ } return (1); } else { if (p == IPPROTO_TCP || p == IPPROTO_UDP) { m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } return (0); } #else /* !__FreeBSD__ */ /* * check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag * off is the offset where the protocol header starts * len is the total length of protocol header plus payload * returns 0 when the checksum is valid, otherwise returns 1. 
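 *
 * The verdict is cached on the mbuf: the matching M_*_CSUM_IN_OK or
 * M_*_CSUM_IN_BAD flag is set, so a later call for the same mbuf and
 * protocol returns immediately without summing again.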
*/ int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t flag_ok, flag_bad; u_int16_t sum; switch (p) { case IPPROTO_TCP: flag_ok = M_TCP_CSUM_IN_OK; flag_bad = M_TCP_CSUM_IN_BAD; break; case IPPROTO_UDP: flag_ok = M_UDP_CSUM_IN_OK; flag_bad = M_UDP_CSUM_IN_BAD; break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ flag_ok = flag_bad = 0; break; default: return (1); } if (m->m_pkthdr.csum_flags & flag_ok) return (0); if (m->m_pkthdr.csum_flags & flag_bad) return (1); if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (af) { #ifdef INET case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } if (sum) { m->m_pkthdr.csum_flags |= flag_bad; switch (p) { case IPPROTO_TCP: tcpstat.tcps_rcvbadsum++; break; case IPPROTO_UDP: udpstat.udps_badsum++; break; case IPPROTO_ICMP: icmpstat.icps_checksum++; break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6stat.icp6s_checksum++; break; #endif /* INET6 */ } return (1); } m->m_pkthdr.csum_flags |= flag_ok; return (0); } #endif /* __FreeBSD__ */ #ifdef INET int #ifdef __FreeBSD__ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct ether_header *eh, struct inpcb *inp) #else pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct ether_header *eh) #endif { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0; struct ip *h = NULL; /* make the compiler happy */ struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, dirndx, pqid = 0; #ifdef __FreeBSD__ PF_LOCK(); #endif if (!pf_status.running) #ifdef __FreeBSD__ { PF_UNLOCK(); #endif return (PF_PASS); #ifdef __FreeBSD__ } #endif memset(&pd, 0, sizeof(pd)); if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: pf_get_mtag returned NULL\n")); return (PF_DROP); } #ifdef __FreeBSD__ if (m->m_flags & M_SKIP_FIREWALL) { PF_UNLOCK(); return (PF_PASS); } #else if (pd.pf_mtag->flags & PF_TAG_GENERATED) return (PF_PASS); #endif #ifdef __FreeBSD__ /* XXX_IMPORT: later */ #else if (ifp->if_type == IFT_CARP && ifp->if_carpdev) ifp = ifp->if_carpdev; #endif kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif return (PF_PASS); } #ifdef __FreeBSD__ M_ASSERTPKTHDR(m); #else #ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("non-M_PKTHDR is passed to pf_test"); #endif /* DIAGNOSTIC */ #endif /* __FreeBSD__ */ if (m->m_pkthdr.len < (int)sizeof(*h)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; h = mtod(m, struct ip *); off = h->ip_hl << 2; if (off < (int)sizeof(*h)) { action = PF_DROP; 
REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } pd.src = (struct pf_addr *)&h->ip_src; pd.dst = (struct pf_addr *)&h->ip_dst; PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET); pd.ip_sum = &h->ip_sum; pd.proto = h->ip_p; pd.af = AF_INET; pd.tos = h->ip_tos; pd.tot_len = ntohs(h->ip_len); pd.eh = eh; /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); goto done; } switch (h->ip_p) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && pf_check_proto_cksum(m, off, ntohs(h->ip_len) - off, IPPROTO_TCP, AF_INET)) { REASON_SET(&reason, PFRES_PROTCKSUM); action = PF_DROP; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_tcp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL, inp); #else action = pf_test_tcp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ipintrq); #endif break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(m, off, ntohs(h->ip_len) - off, IPPROTO_UDP, AF_INET)) { action = PF_DROP; REASON_SET(&reason, PFRES_PROTCKSUM); goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_udp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL, inp); #else action = pf_test_udp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ipintrq); #endif break; } case IPPROTO_ICMP: { struct icmp ih; pd.hdr.icmp = &ih; if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && pf_check_proto_cksum(m, off, ntohs(h->ip_len) - off, IPPROTO_ICMP, AF_INET)) { action = PF_DROP; REASON_SET(&reason, PFRES_PROTCKSUM); goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_icmp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL); #else action = pf_test_icmp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ipintrq); #endif break; } default: action = pf_test_state_other(&s, dir, kif, &pd); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_other(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL); #else action = pf_test_other(&r, &s, dir, kif, m, off, h, &pd, &a, 
&ruleset, &ipintrq); #endif break; } done: if (action == PF_PASS && h->ip_hl > 5 && !((s && s->allow_opts) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with ip options\n")); } if ((s && s->tag) || r->rtableid) pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, r->rtableid); #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pqid || (pd.tos & IPTOS_LOWDELAY)) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* add hints for ecn */ pd.pf_mtag->af = AF_INET; pd.pf_mtag->hdr = h; } #endif /* ALTQ */ /* * connections redirected to loopback should not match sockets * bound specifically to loopback due to security implications, * see tcp_input() and in_pcblookup_listen(). */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset, &pd); } kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL) { struct pf_addr *x; /* * XXX: we need to make sure that the addresses * passed to pfr_update_stats() are the same than * the addresses used during matching (pfr_match) */ if (r == &pf_default_rule) { tr = nr; x = (s == NULL || s->direction == dir) ? &pd.baddr : &pd.naddr; } else x = (s == NULL || s->direction == dir) ? &pd.naddr : &pd.baddr; if (x == &pd.baddr || s == NULL) { /* we need to change the address */ if (dir == PF_OUT) pd.src = x; else pd.dst = x; } } if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL || s->direction == dir) ? pd.src : pd.dst, pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL || s->direction == dir) ? 
pd.dst : pd.src, pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } if (action == PF_SYNPROXY_DROP) { m_freem(*m0); *m0 = NULL; action = PF_PASS; } else if (r->rt) /* pf_route can free the mbuf causing *m0 to become NULL */ pf_route(m0, r, dir, ifp, s, &pd); #ifdef __FreeBSD__ PF_UNLOCK(); #endif return (action); } #endif /* INET */ #ifdef INET6 int #ifdef __FreeBSD__ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct ether_header *eh, struct inpcb *inp) #else pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct ether_header *eh) #endif { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0, *n = NULL; struct ip6_hdr *h; struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0; #ifdef __FreeBSD__ PF_LOCK(); #endif if (!pf_status.running) #ifdef __FreeBSD__ { PF_UNLOCK(); #endif return (PF_PASS); #ifdef __FreeBSD__ } #endif memset(&pd, 0, sizeof(pd)); if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: pf_get_mtag returned NULL\n")); return (PF_DROP); } if (pd.pf_mtag->flags & PF_TAG_GENERATED) return (PF_PASS); #ifdef __FreeBSD__ /* XXX_IMPORT: later */ #else if (ifp->if_type == IFT_CARP && ifp->if_carpdev) ifp = ifp->if_carpdev; #endif kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { #ifdef __FreeBSD__ PF_UNLOCK(); #endif return (PF_PASS); } #ifdef __FreeBSD__ M_ASSERTPKTHDR(m); #else #ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("non-M_PKTHDR is passed to pf_test6"); #endif /* DIAGNOSTIC */ #endif #ifdef __FreeBSD__ h = NULL; /* make the compiler happy */ #endif if (m->m_pkthdr.len < (int)sizeof(*h)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; h = mtod(m, struct ip6_hdr *); #if 1 /* * we do not support jumbogram yet. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. */ if (htons(h->ip6_plen) == 0) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } #endif pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; PF_ACPY(&pd.baddr, dir == PF_OUT ? 
pd.src : pd.dst, AF_INET6); pd.ip_sum = NULL; pd.af = AF_INET6; pd.tos = 0; pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); pd.eh = eh; off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); pd.proto = h->ip6_nxt; do { switch (pd.proto) { case IPPROTO_FRAGMENT: action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); if (action == PF_DROP) REASON_SET(&reason, PFRES_FRAG); goto done; case IPPROTO_ROUTING: { struct ip6_rthdr rthdr; if (rh_cnt++) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 more than one rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 rthdr0\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } /* fallthrough */ } case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); action = PF_DROP; log = 1; goto done; } if (pd.proto == IPPROTO_AH) off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); /* if there's no routing header, use unmodified mbuf for checksumming */ if (!n) n = m; switch (pd.proto) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && pf_check_proto_cksum(n, off, ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), IPPROTO_TCP, AF_INET6)) { action = PF_DROP; REASON_SET(&reason, PFRES_PROTCKSUM); goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_tcp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL, inp); #else action = pf_test_tcp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ip6intrq); #endif break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(n, off, ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), IPPROTO_UDP, AF_INET6)) { action = PF_DROP; REASON_SET(&reason, PFRES_PROTCKSUM); goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_udp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL, inp); #else action = pf_test_udp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ip6intrq); #endif break; } case IPPROTO_ICMPV6: { struct icmp6_hdr ih; 
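/*
 * The length handed to pf_check_proto_cksum() below is ntohs(h->ip6_plen)
 * minus the extension headers already walked (off - sizeof(struct
 * ip6_hdr)), so only the ICMPv6 header and its payload are summed.
 */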
pd.hdr.icmp6 = &ih; if (!pf_pull_hdr(m, off, &ih, sizeof(ih), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (dir == PF_IN && pf_check_proto_cksum(n, off, ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), IPPROTO_ICMPV6, AF_INET6)) { action = PF_DROP; REASON_SET(&reason, PFRES_PROTCKSUM); goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_icmp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL); #else action = pf_test_icmp(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ip6intrq); #endif break; } default: action = pf_test_state_other(&s, dir, kif, &pd); if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); #endif /* NPFSYNC */ r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) #ifdef __FreeBSD__ action = pf_test_other(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, NULL); #else action = pf_test_other(&r, &s, dir, kif, m, off, h, &pd, &a, &ruleset, &ip6intrq); #endif break; } done: /* handle dangerous IPv6 extension headers. */ if (action == PF_PASS && rh_cnt && !((s && s->allow_opts) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with dangerous v6 headers\n")); } if ((s && s->tag) || r->rtableid) pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, r->rtableid); #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.tos & IPTOS_LOWDELAY) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* add hints for ecn */ pd.pf_mtag->af = AF_INET6; pd.pf_mtag->hdr = h; } #endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset, &pd); } kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL) { struct pf_addr *x; /* * XXX: we need to make sure that the addresses * passed to pfr_update_stats() are the same than * the addresses used during matching (pfr_match) */ if (r == &pf_default_rule) { tr = nr; x = (s == NULL || s->direction == dir) ? &pd.baddr : &pd.naddr; } else { x = (s == NULL || s->direction == dir) ? 
&pd.naddr : &pd.baddr; } if (x == &pd.baddr || s == NULL) { if (dir == PF_OUT) pd.src = x; else pd.dst = x; } } if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL || s->direction == dir) ? pd.src : pd.dst, pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL || s->direction == dir) ? pd.dst : pd.src, pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } if (action == PF_SYNPROXY_DROP) { m_freem(*m0); *m0 = NULL; action = PF_PASS; } else if (r->rt) /* pf_route6 can free the mbuf causing *m0 to become NULL */ pf_route6(m0, r, dir, ifp, s, &pd); #ifdef __FreeBSD__ PF_UNLOCK(); #endif return (action); } #endif /* INET6 */ int pf_check_congestion(struct ifqueue *ifq) { #ifdef __FreeBSD__ /* XXX_IMPORT: later */ return (0); #else if (ifq->ifq_congestion) return (1); else return (0); #endif } Index: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c =================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c (revision 178284) +++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c (revision 178285) @@ -1,945 +1,945 @@ /************************************************************************** Copyright (c) 2007, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); #define TMP_IOV_MAX 16 #ifndef PG_FRAME #define PG_FRAME ~PAGE_MASK #endif #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) void t3_init_socket_ops(void) { struct protosw *prp; prp = pffindtype(AF_INET, SOCK_STREAM); pru_sosend = prp->pr_usrreqs->pru_sosend; pru_soreceive = prp->pr_usrreqs->pru_soreceive; } struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; bus_dma_segment_t *cdi_segs; }; static void cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize, int error) { struct cxgb_dma_info *cdi = arg; cdi->cdi_mapped = mapsize; cdi->cdi_nsegs = nsegs; cdi->cdi_segs = segs; } static void iov_adj(struct iovec **iov, int *iovcnt, size_t count) { struct iovec *iovtmp; int iovcnttmp; caddr_t ptmp; if (count > 0) { iovtmp = *iov; iovcnttmp = *iovcnt; while (count > 0) { if (count < iovtmp->iov_len) { ptmp = iovtmp->iov_base; ptmp += count; iovtmp->iov_base = ptmp; iovtmp->iov_len -= count; break; } else count -= iovtmp->iov_len; iovtmp++; iovcnttmp--; } *iov = iovtmp; *iovcnt = iovcnttmp; } else if (count < 0) { iovtmp = &(*iov)[*iovcnt - 1]; iovcnttmp = *iovcnt; while (count < 0) { if (-count < iovtmp->iov_len) { iovtmp->iov_len += count; break; } else count += iovtmp->iov_len; iovtmp--; iovcnttmp--; } *iovcnt = iovcnttmp; } } static void cxgb_zero_copy_free(void *cl, void *arg) { struct mbuf_vec *mv; struct mbuf *m = (struct mbuf *)cl; mv = mtomv(m); /* * Physical addresses, don't try to free should be unheld separately from sbdrop * */ mv->mv_count = 0; m_free_iovec(m, m->m_type); } static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { struct iovec *iov = uio->uio_iov; int iovcnt = uio->uio_iovcnt; int err, i, count, totcount, maxcount, totbytes, npages, curbytes; uint64_t start, end; vm_page_t *mp; totbytes = totcount = 0; maxcount = *held; mp = m; for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { count = maxcount - totcount; start = (uintptr_t)iov->iov_base; end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); start &= PG_FRAME; end += PAGE_MASK; end &= PG_FRAME; npages = (end - start) >> PAGE_SHIFT; count = min(count, npages); err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); if (err) { vm_fault_unhold_pages(m, totcount); return (err); } mp += count; totcount += count; curbytes = iov->iov_len; if (count != npages) curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); totbytes += curbytes; } uio->uio_resid -= totbytes; return (0); } /* * Returns whether a connection should enable DDP. 
This happens when all of * the following conditions are met: * - the connection's ULP mode is DDP * - DDP is not already enabled * - the last receive was above the DDP threshold * - receive buffers are in user space * - receive side isn't shutdown (handled by caller) * - the connection's receive window is big enough so that sizable buffers * can be posted without closing the window in the middle of DDP (checked * when the connection is offloaded) */ static int so_should_ddp(const struct toepcb *toep, int last_recv_len) { DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && toep->tp_tp->rcv_wnd > (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); } static inline int is_ddp(const struct mbuf *m) { return (m->m_flags & M_DDP); } static inline int is_ddp_psh(const struct mbuf *m) { return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); } static int m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) { int curlen, startlen, resid_init, err = 0; caddr_t buf; DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n", m, offset, len); startlen = len; resid_init = uio->uio_resid; while (m && len) { buf = mtod(m, caddr_t); curlen = m->m_len; if (offset && (offset < curlen)) { curlen -= offset; buf += offset; offset = 0; } else if (offset) { offset -= curlen; m = m->m_next; continue; } err = uiomove(buf, min(len, curlen), uio); if (err) { printf("uiomove returned %d\n", err); return (err); } len -= min(len, curlen); m = m->m_next; } DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", startlen - len, resid_init, uio->uio_resid); return (err); } /* * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a * DDP buffer. 
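 *
 * Three cases are handled: plain RX_DATA payload is copied out with
 * m_uiomove(); a user DDP buffer (DDP_BF_NOCOPY) needs no copy at all,
 * so only the iovec and uio_resid are advanced past the data the
 * hardware has already placed; a kernel DDP buffer is copied out via
 * t3_ddp_copy().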
*/ static inline int copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) { struct iovec *to = uio->uio_iov; int err; if (__predict_true(!is_ddp(m))) { /* RX_DATA */ return m_uiomove(m, offset, len, uio); } if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ to->iov_len -= len; to->iov_base = ((caddr_t)to->iov_base) + len; uio->uio_iov = to; uio->uio_resid -= len; return (0); } err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ return (err); } static void cxgb_wait_dma_completion(struct toepcb *toep) { - struct mtx *lock; + struct rwlock *lock; - lock = &toep->tp_tp->t_inpcb->inp_mtx; + lock = &toep->tp_tp->t_inpcb->inp_lock; inp_wlock(toep->tp_tp->t_inpcb); cv_wait_unlock(&toep->tp_cv, lock); } static int cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) { int i, seg_count, err, type; struct mbuf *m0; struct cxgb_dma_info cdi; struct mbuf_vec *mv; struct mbuf_iovec *mi; bus_dma_segment_t *segs; err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio, cxgb_dma_callback, &cdi, 0); if (err) return (err); seg_count = cdi.cdi_nsegs; if ((m0 = mcl_alloc(seg_count, &type)) == NULL) { bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap); return (ENOMEM); } segs = cdi.cdi_segs; m0->m_type = type; m0->m_flags = (M_EXT|M_NOFREE); m0->m_ext.ext_type = EXT_EXTREF; m0->m_ext.ext_free = cxgb_zero_copy_free; m0->m_ext.ext_arg1 = NULL; /* XXX: probably wrong /phk */ m0->m_ext.ext_arg2 = NULL; mv = mtomv(m0); mv->mv_count = seg_count; mv->mv_first = 0; for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++) mi_collapse_sge(mi, segs); *m = m0; /* * This appears to be a no-op at the moment * as busdma is all or nothing need to make * sure the tag values are large enough * */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else uio->uio_resid = 0; return (0); } static int t3_sosend(struct socket *so, struct uio *uio) { int rv, count, hold_resid, sent, iovcnt; struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m; struct uio uiotmp; /* * Events requiring iteration: * - number of pages exceeds max hold pages for process or system * - number of pages exceeds maximum sg entries for a single WR * * We're limited to holding 128 pages at once - and we're limited to * 34 SG entries per work request, but each SG entry can be any number * of contiguous pages * */ uiotmp = *uio; iovcnt = uio->uio_iovcnt; iov = uio->uio_iov; sent = 0; sendmore: /* * Make sure we don't exceed the socket buffer */ count = min(toep->tp_page_count, (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE); rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0); hold_resid = uiotmp.uio_resid; if (rv) return (rv); /* * Bump past sent and shave off the unheld amount */ if (hold_resid > 0) { iovtmpp = iovtmp; memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); if (sent) iov_adj(&iovtmpp, &iovcnt, sent); iov_adj(&iovtmpp, &iovcnt, -hold_resid); uiotmp.uio_iov = iovtmpp; uiotmp.uio_iovcnt = iovcnt; } uiotmp.uio_resid = uio->uio_resid - hold_resid; /* * Push off all held pages * */ while (uiotmp.uio_resid > 0) { rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m); if (rv) { vm_fault_unhold_pages(toep->tp_pages, count); return (rv); } uio->uio_resid -= m->m_pkthdr.len; sent += m->m_pkthdr.len; sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } /* * Wait for pending I/O to be DMA'd to the card * */ 
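/*
 * cxgb_wait_dma_completion() write-locks the inpcb and then sleeps on
 * toep->tp_cv via cv_wait_unlock(), which releases the lock as part of
 * the wait; with this revision the lock handed to the condition
 * variable is the inpcb's rwlock (inp_lock) rather than the former
 * inp_mtx mutex.
 */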
cxgb_wait_dma_completion(toep); vm_fault_unhold_pages(toep->tp_pages, count); /* * If there is more data to send adjust local copy of iov * to point to teh start */ if (hold_resid) { iovtmpp = iovtmp; memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); iov_adj(&iovtmpp, &iovcnt, sent); uiotmp = *uio; uiotmp.uio_iov = iovtmpp; uiotmp.uio_iovcnt = iovcnt; goto sendmore; } return (0); } static int cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { struct tcpcb *tp = sototcpcb(so); struct toedev *tdev; int zcopy_thres, zcopy_enabled, rv; /* * In order to use DMA direct from userspace the following * conditions must be met: * - the connection is currently offloaded * - ddp is enabled * - the number of bytes to be transferred exceeds the threshold * - the number of bytes currently in flight won't exceed the in-flight * threshold XXX TODO * - vm_fault_hold_user_pages succeeds * - blocking socket XXX for now * */ if (tp->t_flags & TF_TOE) { tdev = TOE_DEV(so); zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); if (uio && (uio->uio_resid > zcopy_thres) && (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) { rv = t3_sosend(so, uio); if (rv != EAGAIN) return (rv); } } return pru_sosend(so, addr, uio, top, control, flags, td); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } #define IS_NONBLOCKING(so) ((so)->so_state & SS_NBIO) static int t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m; uint32_t offset; int err, flags, avail, len, copied, copied_unacked; int target; /* Read at least this many bytes */ int user_ddp_ok; struct ddp_state *p; struct inpcb *inp = sotoinpcb(so); avail = offset = copied = copied_unacked = 0; flags = flagsp ? (*flagsp &~ MSG_EOR) : 0; err = sblock(&so->so_rcv, SBLOCKWAIT(flags)); p = &toep->tp_ddp_state; if (err) return (err); SOCKBUF_LOCK(&so->so_rcv); p->user_ddp_pending = 0; restart: len = uio->uio_resid; m = so->so_rcv.sb_mb; target = (flags & MSG_WAITALL) ? 
len : so->so_rcv.sb_lowat; user_ddp_ok = p->ubuf_ddp_ready; p->cancel_ubuf = 0; if (len == 0) goto done; #if 0 while (m && m->m_len == 0) { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } #endif if (m) goto got_mbuf; /* empty receive queue */ if (copied >= target && (so->so_rcv.sb_mb == NULL) && !p->user_ddp_pending) goto done; if (copied) { if (so->so_error || tp->t_state == TCPS_CLOSED || (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) goto done; } else { if (so->so_state & SS_NOFDREF) goto done; if (so->so_error) { err = so->so_error; so->so_error = 0; goto done; } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) goto done; if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) goto done; if (tp->t_state == TCPS_CLOSED) { err = ENOTCONN; goto done; } } if (so->so_rcv.sb_mb && !p->user_ddp_pending) { SOCKBUF_UNLOCK(&so->so_rcv); inp_wlock(inp); t3_cleanup_rbuf(tp, copied_unacked); inp_wunlock(inp); SOCKBUF_LOCK(&so->so_rcv); copied_unacked = 0; goto restart; } if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && p->ubuf_ddp_ready) { p->user_ddp_pending = !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); if (p->user_ddp_pending) { p->kbuf_posted++; user_ddp_ok = 0; } } if (p->kbuf[0] && (p->kbuf_posted == 0)) { t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); p->kbuf_posted++; } if (p->user_ddp_pending) { /* One shot at DDP if we already have enough data */ if (copied >= target) user_ddp_ok = 0; DPRINTF("sbwaiting 1\n"); if ((err = sbwait(&so->so_rcv)) != 0) goto done; //for timers to work await_ddp_completion(sk, flags, &timeo); } else if (copied >= target) goto done; else { if (copied_unacked) { int i = 0; SOCKBUF_UNLOCK(&so->so_rcv); inp_wlock(inp); t3_cleanup_rbuf(tp, copied_unacked); inp_wunlock(inp); copied_unacked = 0; if (mp_ncpus > 1) while (i++ < 200 && so->so_rcv.sb_mb == NULL) cpu_spinwait(); SOCKBUF_LOCK(&so->so_rcv); } if (so->so_rcv.sb_mb) goto restart; DPRINTF("sbwaiting 2 copied=%d target=%d avail=%d so=%p mb=%p cc=%d\n", copied, target, avail, so, so->so_rcv.sb_mb, so->so_rcv.sb_cc); if ((err = sbwait(&so->so_rcv)) != 0) goto done; } goto restart; got_mbuf: KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); if (m->m_pkthdr.len == 0) { if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) panic("empty mbuf and NOCOPY not set\n"); CTR0(KTR_TOM, "ddp done notification"); p->user_ddp_pending = 0; sbdroprecord_locked(&so->so_rcv); goto done; } offset = toep->tp_copied_seq + copied_unacked - m->m_seq; DPRINTF("m=%p copied_seq=0x%x copied_unacked=%d m_seq=0x%x offset=%d pktlen=%d is_ddp(m)=%d\n", m, toep->tp_copied_seq, copied_unacked, m->m_seq, offset, m->m_pkthdr.len, !!is_ddp(m)); if (offset >= m->m_pkthdr.len) panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x seq 0x%x " "pktlen %d ddp flags 0x%x", offset, toep->tp_copied_seq + copied_unacked, m->m_seq, m->m_pkthdr.len, m->m_ddp_flags); avail = m->m_pkthdr.len - offset; if (len < avail) { if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); avail = len; } CTR4(KTR_TOM, "t3_soreceive: m_len=%u offset=%u len=%u m_seq=0%08x", m->m_pkthdr.len, 
offset, len, m->m_seq); #ifdef URGENT_DATA_SUPPORTED /* * Check if the data we are preparing to copy contains urgent * data. Either stop short of urgent data or skip it if it's * first and we are not delivering urgent data inline. */ if (__predict_false(toep->tp_urg_data)) { uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; if (urg_offset < avail) { if (urg_offset) { /* stop short of the urgent data */ avail = urg_offset; } else if ((so->so_options & SO_OOBINLINE) == 0) { /* First byte is urgent, skip */ toep->tp_copied_seq++; offset++; avail--; if (!avail) goto skip_copy; } } } #endif if (is_ddp_psh(m) || offset) { user_ddp_ok = 0; #ifdef T3_TRACE T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); #endif } if (user_ddp_ok && !p->user_ddp_pending && uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && p->ubuf_ddp_ready) { p->user_ddp_pending = !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); if (p->user_ddp_pending) { p->kbuf_posted++; user_ddp_ok = 0; } DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); } else DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); /* * If MSG_TRUNC is specified the data is discarded. * XXX need to check pr_atomic */ KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); if (__predict_true(!(flags & MSG_TRUNC))) { int resid = uio->uio_resid; SOCKBUF_UNLOCK(&so->so_rcv); if ((err = copy_data(m, offset, avail, uio))) { if (err) err = EFAULT; goto done_unlocked; } SOCKBUF_LOCK(&so->so_rcv); if (avail != (resid - uio->uio_resid)) printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); } copied += avail; copied_unacked += avail; len -= avail; #ifdef URGENT_DATA_SUPPORTED skip_copy: if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) tp->urg_data = 0; #endif /* * If the buffer is fully consumed free it. If it's a DDP * buffer also handle any events it indicates. 
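 *
 * The DDP flags on the record decide what happens next: DDP_BF_NOCOPY
 * clears the pending user-DDP state, DDP_BF_NODATA on a non-blocking
 * socket notes that no more data is coming, and otherwise the kernel
 * buffer is treated as consumed and the user buffer becomes eligible
 * for DDP again.  The record's mbufs are then freed and
 * sockbuf_pushsync() restores the socket-buffer invariants.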
*/ if (avail + offset >= m->m_pkthdr.len) { unsigned int fl = m->m_ddp_flags; int exitnow, got_psh = 0, nomoredata = 0; int count; struct mbuf *nextrecord; if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { if (is_ddp_psh(m) && p->user_ddp_pending) got_psh = 1; if (fl & DDP_BF_NOCOPY) p->user_ddp_pending = 0; else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { p->kbuf_posted--; nomoredata = 1; } else { p->kbuf_posted--; p->ubuf_ddp_ready = 1; } } nextrecord = m->m_nextpkt; count = m->m_pkthdr.len; while (count > 0) { count -= m->m_len; KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); #if 0 sbdrop_locked(&so->so_rcv, m->m_pkthdr.len); #endif exitnow = got_psh || nomoredata; if (copied >= target && (so->so_rcv.sb_mb == NULL) && exitnow) goto done; if (copied_unacked > (so->so_rcv.sb_hiwat >> 2)) { SOCKBUF_UNLOCK(&so->so_rcv); inp_wlock(inp); t3_cleanup_rbuf(tp, copied_unacked); inp_wunlock(inp); copied_unacked = 0; SOCKBUF_LOCK(&so->so_rcv); } } if (len > 0) goto restart; done: /* * If we can still receive decide what to do in preparation for the * next receive. Note that RCV_SHUTDOWN is set if the connection * transitioned to CLOSE but not if it was in that state to begin with. */ if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { if (p->user_ddp_pending) { SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_rcv); user_ddp_ok = 0; t3_cancel_ubuf(toep); if (so->so_rcv.sb_mb) { if (copied < 0) copied = 0; if (len > 0) goto restart; } p->user_ddp_pending = 0; } if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { #ifdef T3_TRACE T3_TRACE0(TIDTB(so), "chelsio_recvmsg: about to exit, repost kbuf"); #endif t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); p->kbuf_posted++; } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); if (!t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so), ddp_copy_limit), 0, IS_NONBLOCKING(so))) p->kbuf_posted = 1; } } #ifdef T3_TRACE T3_TRACE5(TIDTB(so), "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " "kbuf_posted %d user_ddp_pending %u", copied, len, buffers_freed, p ? p->kbuf_posted : -1, p->user_ddp_pending); #endif SOCKBUF_UNLOCK(&so->so_rcv); done_unlocked: if (copied_unacked) { inp_wlock(inp); t3_cleanup_rbuf(tp, copied_unacked); inp_wunlock(inp); } sbunlock(&so->so_rcv); return (err); } static int cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct toedev *tdev; int rv, zcopy_thres, zcopy_enabled, flags; struct tcpcb *tp = sototcpcb(so); flags = flagsp ? 
*flagsp &~ MSG_EOR : 0; /* * In order to use DMA direct from userspace the following * conditions must be met: * - the connection is currently offloaded * - ddp is enabled * - the number of bytes to be transferred exceeds the threshold * - the number of bytes currently in flight won't exceed the in-flight * threshold XXX TODO * - vm_fault_hold_user_pages succeeds * - blocking socket XXX for now * - iovcnt is 1 * */ if ((tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) && (uio->uio_iovcnt == 1) && (mp0 == NULL)) { tdev = TOE_DEV(so); zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); zcopy_enabled = TOM_TUNABLE(tdev, ddp); if ((uio->uio_resid > zcopy_thres) && (uio->uio_iovcnt == 1) && zcopy_enabled) { rv = t3_soreceive(so, flagsp, uio); if (rv != EAGAIN) return (rv); else printf("returned EAGAIN\n"); } } else if ((tp->t_flags & TF_TOE) && uio && mp0 == NULL) printf("skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", flags, uio->uio_iovcnt, so->so_rcv.sb_state); return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); } void t3_install_socket_ops(struct socket *so) { so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; } Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 178284) +++ head/sys/kern/subr_witness.c (revision 178285) @@ -1,2059 +1,2059 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif /* Easier to stay with the old names. */ #define lo_list lo_witness_data.lod_list #define lo_witness lo_witness_data.lod_witness #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ /* Define this to check for blessed mutexes */ #undef BLESSING #define WITNESS_COUNT 1024 #define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4) /* * XXX: This is somewhat bogus, as we assume here that at most 1024 threads * will hold LOCK_NCHILDREN * 2 locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2 #define WITNESS_NCHILDREN 6 #define LOCK_NCHILDREN 3 struct witness_child_list_entry; /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; /* Recursion count and LI_* flags. */ }; /* * A simple list type used to build the list of locks held by a thread * or CPU. 
We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; struct witness { const char *w_name; struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness_child_list_entry *w_children; /* Great evilness... */ const char *w_file; int w_line; u_int w_level; u_int w_refcount; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_displayed:1; }; struct witness_child_list_entry { struct witness_child_list_entry *wcl_next; struct witness *wcl_children[WITNESS_NCHILDREN]; u_int wcl_count; }; STAILQ_HEAD(witness_list, witness); #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static int depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static int insertchild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static const char *fixup_filename(const char *file); static struct witness *witness_get(void); static void witness_free(struct witness *m); static struct witness_child_list_entry *witness_child_get(void); static void witness_child_free(struct witness_child_list_entry *wcl); static struct lock_list_entry *witness_lock_list_get(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_instance *find_instance(struct lock_list_entry *lock_list, struct lock_object *lock); static void witness_list_lock(struct lock_instance *instance); #ifdef DDB static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *, int indent); static void witness_display_list(void(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_display(void(*)(const char *fmt, ...)); static void witness_list(struct thread *td); #endif SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, 0, "Witness Locking"); /* * If set to 0, witness is disabled. If set to a non-zero value, witness * performs full lock order checking for all locks. At runtime, this * value may be set to 0 to turn off witness. witness is not allowed be * turned on once it is turned off, however. 
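 *
 * In practice this means that
 *
 *	sysctl debug.witness.watch=0
 *
 * disables checking until the next reboot, while writing any non-zero
 * value back is rejected by the handler with EINVAL.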
*/ static int witness_watch = 1; TUNABLE_INT("debug.witness.watch", &witness_watch); SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RW | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is set to 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif TUNABLE_INT("debug.witness.kdb", &witness_kdb); SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RW, &witness_kdb, 0, ""); /* * When KDB is enabled and witness_trace is set to 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; TUNABLE_INT("debug.witness.trace", &witness_trace); SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RW, &witness_trace, 0, ""); #endif /* KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif TUNABLE_INT("debug.witness.skipspin", &witness_skipspin); SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); static struct mtx w_mtx; static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); static struct witness_child_list_entry *w_child_free = NULL; static struct lock_list_entry *w_lock_list_free = NULL; static int w_free_cnt, w_spin_cnt, w_sleep_cnt, w_child_free_cnt, w_child_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, child_free_cnt, CTLFLAG_RD, &w_child_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, child_cnt, CTLFLAG_RD, &w_child_cnt, 0, ""); static struct witness w_data[WITNESS_COUNT]; static struct witness_child_list_entry w_childdata[WITNESS_CHILDCOUNT]; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_mtx_sleep }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Multicast - protocol locks before interface locks, after UDP locks. 
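 *
 * The "udpinp", "udp", "tcp" and "tcpinp" entries below move from
 * lock_class_mtx_sleep to lock_class_rw, keeping this static order
 * list in step with the conversion of those pcb locks to rwlocks
 * elsewhere in this revision (cf. inp_mtx becoming inp_lock above).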
*/ - { "udpinp", &lock_class_mtx_sleep }, + { "udpinp", &lock_class_rw }, { "in_multi_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_mtx", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ - { "udp", &lock_class_mtx_sleep }, - { "udpinp", &lock_class_mtx_sleep }, + { "udp", &lock_class_rw }, + { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ - { "tcp", &lock_class_mtx_sleep }, - { "tcpinp", &lock_class_mtx_sleep }, + { "tcp", &lock_class_rw }, + { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * SLIP */ { "slip_mtx", &lock_class_mtx_sleep }, { "slip sc_mtx", &lock_class_mtx_sleep }, { NULL, NULL }, /* * netatalk */ { "ddp_list_mtx", &lock_class_mtx_sleep }, { "ddp_mtx", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_mtx_sleep }, { "bpf interface lock", &lock_class_mtx_sleep }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "system map", &lock_class_mtx_sleep }, { "vm page queue mutex", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, { "scrlock", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "umtx lock", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "time lock", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", 
&lock_class_mtx_spin }, #ifdef __ia64__ { "MCA spin lock", &lock_class_mtx_spin }, #endif #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); #endif /* * List of locks initialized prior to witness being initialized whose * enrollment is currently deferred. */ STAILQ_HEAD(, lock_object) pending_locks = STAILQ_HEAD_INITIALIZER(pending_locks); /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ static void witness_initialize(void *dummy __unused) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; int i; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = 0; i < WITNESS_COUNT; i++) witness_free(&w_data[i]); for (i = 0; i < WITNESS_CHILDCOUNT; i++) witness_child_free(&w_childdata[i]); for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; if (!itismychild(w, w1)) panic("Not enough memory for static orders!"); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ while (!STAILQ_EMPTY(&pending_locks)) { lock = STAILQ_FIRST(&pending_locks); STAILQ_REMOVE_HEAD(&pending_locks, lo_list); KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(lock->lo_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. 
*/ witness_cold = 0; mtx_lock(&Giant); } SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value == witness_watch) return (0); if (value != 0) return (EINVAL); witness_watch = 0; return (0); } void witness_init(struct lock_object *lock) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch == 0 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { STAILQ_INSERT_TAIL(&pending_locks, lock, lo_list); lock->lo_flags |= LO_ENROLLPEND; } else lock->lo_witness = enroll(lock->lo_type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & (LO_WITNESS | LO_ENROLLPEND)) == LO_WITNESS && lock->lo_witness != NULL) { w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; /* * Lock is already released if we have an allocation failure * and depart() fails. */ if (w->w_refcount != 0 || depart(w)) mtx_unlock_spin(&w_mtx); } /* * If this lock is destroyed before witness is up and running, * remove it from the pending list. */ if (lock->lo_flags & LO_ENROLLPEND) { STAILQ_REMOVE(&pending_locks, lock, lock_object, lo_list); lock->lo_flags &= ~LO_ENROLLPEND; } } #ifdef DDB static void witness_levelall (void) { struct witness_list *list; struct witness *w, *w1; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) { w->w_level = 0; } /* * Look for locks with no parent and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* * This is just an optimization, technically we could get * away just walking the all list each time. 
*/ if (w->w_class->lc_flags & LC_SLEEPLOCK) list = &w_sleep; else list = &w_spin; STAILQ_FOREACH(w1, list, w_typelist) { if (isitmychild(w1, w)) goto skip; } witness_leveldescendents(w, 0); skip: ; /* silence GCC 3.x */ } } static void witness_leveldescendents(struct witness *parent, int level) { struct witness_child_list_entry *wcl; int i; if (parent->w_level < level) parent->w_level = level; level++; for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) for (i = 0; i < wcl->wcl_count; i++) witness_leveldescendents(wcl->wcl_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent, int indent) { struct witness_child_list_entry *wcl; int i, level; level = parent->w_level; prnt("%-2d", level); for (i = 0; i < indent; i++) prnt(" "); if (parent->w_refcount > 0) prnt("%s", parent->w_name); else prnt("(dead)"); if (parent->w_displayed) { prnt(" -- (already displayed)\n"); return; } parent->w_displayed = 1; if (parent->w_refcount > 0) { if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d", parent->w_file, parent->w_line); } prnt("\n"); for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) for (i = 0; i < wcl->wcl_count; i++) witness_displaydescendants(prnt, wcl->wcl_children[i], indent + 1); } static void witness_display_list(void(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_level > 0) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w, 0); } } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); witness_levelall(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) { w->w_displayed = 0; } /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_display_list(prnt, &w_sleep); /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_display_list(prnt, &w_spin); /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s\n", w->w_name); } } #endif /* DDB */ /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == 0 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. 
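 *
 * An order in the opposite direction was rejected above with EDOOFUS;
 * here the explicit lock1 -> lock2 edge is recorded, and a failure to
 * record it (e.g. no memory for more child entries) is reported as
 * ENOMEM.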
*/ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_type, lock1->lo_type); if (!itismychild(lock1->lo_witness, lock2->lo_witness)) return (ENOMEM); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *lock1, *lock2; struct lock_class *class; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch == 0 || lock->lo_witness == NULL || panicstr != NULL) return; /* * Try locks do not block if they fail to acquire the lock, thus * there is no danger of deadlocks or of switching while holding a * spin lock if we acquire a lock via a try operation. This * function shouldn't even be called for try locks, so panic if * that happens. */ if (flags & LOP_TRYLOCK) panic("%s should not be called for try lock operations", __func__); w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; file = fixup_filename(file); if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) panic("blockable sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ if (td->td_sleeplocks == NULL) return; lock_list = &td->td_sleeplocks; } else { /* * If this is the first lock, just return as no order * checking is needed. We check this in both if clauses * here as unifying the check would require us to use a * critical section to ensure we don't migrate while doing * the check. Note that if this is not the first lock, we * are already in a critical section and are safe for the * rest of the check. */ if (PCPU_GET(spinlocks) == NULL) return; lock_list = PCPU_PTR(spinlocks); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(*lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { printf("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, file, line); printf("while exclusively locked from %s:%d\n", lock1->li_file, lock1->li_line); panic("share->excl"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { printf("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, file, line); printf("while share locked from %s:%d\n", lock1->li_file, lock1->li_line); panic("excl->share"); } return; } /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. 
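 *
 * Only the most recently acquired lock (ll_children[ll_count - 1]) is
 * compared; an older duplicate would already have shown up as a lock
 * order reversal.  LO_DUPOK on the lock or LOP_DUPOK on the acquire
 * suppresses the warning, and each witness complains at most once
 * (w_same_squawked).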
*/ lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; w1 = lock1->li_lock->lo_witness; if (w1 == w) { if (w->w_same_squawked || (lock->lo_flags & LO_DUPOK) || (flags & LOP_DUPOK)) return; w->w_same_squawked = 1; printf("acquiring duplicate lock of same type: \"%s\"\n", lock->lo_type); printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name, lock1->li_file, lock1->li_line); printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line); #ifdef KDB goto debugger; #else return; #endif } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin(&w_mtx); /* * If we know that the the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) { mtx_unlock_spin(&w_mtx); return; } for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < WITNESS_COUNT); lock1 = &lle->ll_children[i]; w1 = lock1->li_lock->lo_witness; /* * If this lock doesn't undergo witness checking, * then skip it. */ if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ mtx_unlock_spin(&w_mtx); #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) return; #endif if (lock1->li_lock == &Giant.lock_object) { if (w1->w_Giant_squawked) return; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) return; else w1->w_other_squawked = 1; } /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) printf( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) printf( "lock order reversal: (Giant after non-sleepable)\n"); else printf("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. 
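 *
 * The search walks backwards through the held-lock entries, moving to
 * the next lock_list_entry when one is exhausted.  If an earlier
 * acquisition with witness w is found it is printed as the "1st" lock
 * along with the conflicting pair; otherwise only the two conflicting
 * acquisitions are reported.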
*/ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { printf(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, lock1->li_lock->lo_type, lock1->li_file, lock1->li_line); printf(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, lock->lo_type, file, line); } else { printf(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_type, lock2->li_file, lock2->li_line); printf(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, lock1->li_lock->lo_type, lock1->li_file, lock1->li_line); printf(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, lock->lo_type, file, line); } #ifdef KDB goto debugger; #else return; #endif } } lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(lock1->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock->lo_type, lock1->li_lock->lo_type); if (!itismychild(lock1->li_lock->lo_witness, w)) /* Witness is dead. */ return; } mtx_unlock_spin(&w_mtx); return; #ifdef KDB debugger: if (witness_trace) kdb_backtrace(); if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, __func__); #endif } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == 0 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; file = fixup_filename(file); /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. 
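 *
 * When the head entry is already full (LOCK_NCHILDREN instances) a
 * fresh lock_list_entry is pushed onto the front of the list; the new
 * instance records the lock, the acquiring file and line, and whether
 * the lock was taken exclusively (LI_EXCLUSIVE).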
*/ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); file = fixup_filename(file); if ((lock->lo_flags & LO_UPGRADABLE) == 0) panic("upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) panic("upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((instance->li_flags & LI_EXCLUSIVE) != 0) panic("upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((instance->li_flags & LI_RECURSEMASK) != 0) panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, file, line); instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); file = fixup_filename(file); if ((lock->lo_flags & LO_UPGRADABLE) == 0) panic("downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) panic("downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((instance->li_flags & LI_EXCLUSIVE) == 0) panic("downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, file, line); if ((instance->li_flags & LI_RECURSEMASK) != 0) panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, file, line); instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || witness_watch == 0 || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); file = fixup_filename(file); /* Find lock instance associated with this lock. 
*/ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, file, line); found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { printf("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, file, line); printf("while exclusively locked from %s:%d\n", instance->li_file, instance->li_line); panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { printf("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, file, line); printf("while share locked from %s:%d\n", instance->li_file, instance->li_line); panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* If this lock list entry is now empty, free it. */ if ((*lock_list)->ll_count == 0) { lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * console along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch == 0 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following"); if (flags & WARN_SLEEPOK) printf(" non-sleepable"); printf(" locks held:\n"); } n++; witness_list_lock(lock1); } if (PCPU_GET(spinlocks) != NULL) { /* * Since we already hold a spinlock preemption is * already blocked. 
*/ if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following"); if (flags & WARN_SLEEPOK) printf(" non-sleepable"); printf(" locks held:\n"); } n += witness_list_locks(PCPU_PTR(spinlocks)); } if (flags & WARN_PANIC && n) panic("witness_warn"); #ifdef KDB else if (witness_kdb && n) kdb_enter(KDB_WHY_WITNESS, __func__); else if (witness_trace && n) kdb_backtrace(); #endif return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch == 0 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch == 0 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; if (witness_watch == 0 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin) return (NULL); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_name == description || (w->w_refcount > 0 && strcmp(description, w->w_name) == 0)) { w->w_refcount++; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } } if ((w = witness_get()) == NULL) goto out; w->w_name = description; w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } else { mtx_unlock_spin(&w_mtx); panic("lock class %s is not sleep or spin", lock_class->lc_name); } mtx_unlock_spin(&w_mtx); out: /* * We issue a warning for any spin locks not defined in the static * order list as a way to discourage their use (folks should really * be using non-spin mutexes most of the time). However, several * 3rd party device drivers use spin locks because that is all they * have available on Windows and Linux and they think that normal * mutexes are insufficient. */ if ((lock_class->lc_flags & LC_SPINLOCK) && witness_spin_warn) printf("WITNESS: spin lock %s not in order list\n", description); return (w); } /* Don't let the door bang you on the way out... */ static int depart(struct witness *w) { struct witness_child_list_entry *wcl, *nwcl; struct witness_list *list; struct witness *parent; MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { list = &w_sleep; w_sleep_cnt--; } else { list = &w_spin; w_spin_cnt--; } /* * First, we run through the entire tree looking for any * witnesses that the outgoing witness is a child of. For * each parent that we find, we reparent all the direct * children of the outgoing witness to its parent. */ STAILQ_FOREACH(parent, list, w_typelist) { if (!isitmychild(parent, w)) continue; removechild(parent, w); } /* * Now we go through and free up the child list of the * outgoing witness. */ for (wcl = w->w_children; wcl != NULL; wcl = nwcl) { nwcl = wcl->wcl_next; w_child_cnt--; witness_child_free(wcl); } /* * Detach from various lists and free. */ STAILQ_REMOVE(list, w, witness, w_typelist); STAILQ_REMOVE(&w_all, w, witness, w_list); witness_free(w); return (1); } /* * Add "child" as a direct child of "parent". Returns false if * we fail due to out of memory.
*/ static int insertchild(struct witness *parent, struct witness *child) { struct witness_child_list_entry **wcl; MPASS(child != NULL && parent != NULL); /* * Insert "child" after "parent" */ wcl = &parent->w_children; while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN) wcl = &(*wcl)->wcl_next; if (*wcl == NULL) { *wcl = witness_child_get(); if (*wcl == NULL) return (0); w_child_cnt++; } (*wcl)->wcl_children[(*wcl)->wcl_count++] = child; return (1); } static int itismychild(struct witness *parent, struct witness *child) { struct witness_list *list; MPASS(child != NULL && parent != NULL); if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) != (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))) panic( "%s: parent (%s) and child (%s) are not the same lock type", __func__, parent->w_class->lc_name, child->w_class->lc_name); if (!insertchild(parent, child)) return (0); if (parent->w_class->lc_flags & LC_SLEEPLOCK) list = &w_sleep; else list = &w_spin; return (1); } static void removechild(struct witness *parent, struct witness *child) { struct witness_child_list_entry **wcl, *wcl1; int i; for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next) for (i = 0; i < (*wcl)->wcl_count; i++) if ((*wcl)->wcl_children[i] == child) goto found; return; found: (*wcl)->wcl_count--; if ((*wcl)->wcl_count > i) (*wcl)->wcl_children[i] = (*wcl)->wcl_children[(*wcl)->wcl_count]; MPASS((*wcl)->wcl_children[i] != NULL); if ((*wcl)->wcl_count != 0) return; wcl1 = *wcl; *wcl = wcl1->wcl_next; w_child_cnt--; witness_child_free(wcl1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness_child_list_entry *wcl; int i; for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { for (i = 0; i < wcl->wcl_count; i++) { if (wcl->wcl_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness_child_list_entry *wcl; int i, j; if (isitmychild(parent, child)) return (1); j = 0; for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { MPASS(j < 1000); for (i = 0; i < wcl->wcl_count; i++) { if (isitmydescendant(wcl->wcl_children[i], child)) return (1); } j++; } return (0); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; if (witness_watch == 0) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = 0; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct witness_child_list_entry * witness_child_get(void) { struct witness_child_list_entry *wcl; if (witness_watch == 0) { mtx_unlock_spin(&w_mtx); return (NULL); } wcl = w_child_free; if (wcl == NULL) { witness_watch = 0; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_child_free = wcl->wcl_next; w_child_free_cnt--; bzero(wcl, sizeof(*wcl)); return (wcl); } static void 
witness_child_free(struct witness_child_list_entry *wcl) { wcl->wcl_next = w_child_free; w_child_free = wcl; w_child_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == 0) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = 0; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *lock_list, struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance) { struct lock_object *lock; lock = instance->li_lock; printf("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_type != lock->lo_name) printf(" (%s)", lock->lo_type); printf(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, instance->li_file, instance->li_line); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { return (td->td_sleeplocks != NULL); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i]); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. 
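 *
 * As a hedged sketch of the sort of caller this is intended for (the names
 * m and owner below are hypothetical, not taken from this file), a
 * spin-lock acquisition path that has spun for too long might do:
 *
 *	printf("spin lock %s held by thread %p for too long\n",
 *	    m->lock_object.lo_name, (void *)owner);
 *	witness_display_spinlock(&m->lock_object, owner);
 *	panic("spin lock held too long");
 *
 * so that the owner's acquisition point is shown before panicking.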
*/ void witness_display_spinlock(struct lock_object *lock, struct thread *owner) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; instance->li_file = file; instance->li_line = line; } void witness_assert(struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); } file = fixup_filename(file); switch (flags) { case LA_UNLOCKED: if (instance != NULL) panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, file, line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, file, line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) panic("Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, file, line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) panic("Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, file, line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, file, line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, file, line); break; default: 
panic("Invalid lock assertion at %s:%d.", file, line); } #endif /* INVARIANT_SUPPORT */ } #ifdef DDB static void witness_list(struct thread *td) { KASSERT(!witness_cold, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch == 0) return; witness_list_locks(&td->td_sleeplocks); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks)); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, TRUE); else td = kdb_thread; witness_list(td); } DB_SHOW_COMMAND(alllocks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, td->td_name, td, td->td_tid); witness_list(td); } } } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 178284) +++ head/sys/netinet/in_mcast.c (revision 178285) @@ -1,1822 +1,1822 @@ /*- * Copyright (c) 2007 Bruce M. Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * IPv4 multicast socket, group, and socket option processing module. * Until further notice, this file requires INET to compile. * TODO: Make this infrastructure independent of address family. * TODO: Teach netinet6 to use this code. * TODO: Hook up SSM logic to IGMPv3/MLDv2. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; #ifdef INET6 struct sockaddr_in6 sin6; #endif }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "in_msource", "IPv4 multicast source filter"); /* * The IPv4 multicast list (in_multihead and associated structures) are * protected by the global in_multi_mtx. See in_var.h for more details. For * now, in_multi_mtx is marked as recursible due to IGMP's calling back into * ip_output() to send IGMP packets while holding the lock; this probably is * not quite desirable. */ struct in_multihead in_multihead; /* XXX BSS initialization */ struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_match_group() * imo_match_source() * in_addmulti() * in_delmulti() * in_delmulti_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() */ static int imo_grow(struct ip_moptions *); static int imo_join_source(struct ip_moptions *, size_t, sockunion_t *); static int imo_leave_source(struct ip_moptions *, size_t, sockunion_t *); static int inp_change_source_filter(struct inpcb *, struct sockopt *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_IPMSOURCE, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. 
*/ for (idx = oldmax; idx < newmax; idx++) { nmfilters[idx].imf_fmode = MCAST_EXCLUDE; nmfilters[idx].imf_nsources = 0; TAILQ_INIT(&nmfilters[idx].imf_sources); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_IPMSOURCE); return (ETOOMANYREFS); } return (0); } /* * Add a source to a multicast filter list. * Assumes the associated inpcb is locked. */ static int imo_join_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) { struct in_msource *ims, *nims; struct in_mfilter *imf; KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters vector not allocated", __func__)); imf = &imo->imo_mfilters[gidx]; if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) return (ENOBUFS); ims = imo_match_source(imo, gidx, &src->sa); if (ims != NULL) return (EADDRNOTAVAIL); /* Do not sleep with inp lock held. */ MALLOC(nims, struct in_msource *, sizeof(struct in_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOBUFS); nims->ims_addr = src->ss; TAILQ_INSERT_TAIL(&imf->imf_sources, nims, ims_next); imf->imf_nsources++; return (0); } static int imo_leave_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) { struct in_msource *ims; struct in_mfilter *imf; KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters vector not allocated", __func__)); imf = &imo->imo_mfilters[gidx]; if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) return (ENOBUFS); ims = imo_match_source(imo, gidx, &src->sa); if (ims == NULL) return (EADDRNOTAVAIL); TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); FREE(ims, M_IPMSOURCE); imf->imf_nsources--; return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ size_t imo_match_group(struct ip_moptions *imo, struct ifnet *ifp, struct sockaddr *group) { sockunion_t *gsa; struct in_multi **pinm; int idx; int nmships; gsa = (sockunion_t *)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; #if 0 printf("%s: trying ifp = %p, inaddr = %s ", __func__, ifp, inet_ntoa(gsa->sin.sin_addr)); printf("against %p, %s\n", (*pinm)->inm_ifp, inet_ntoa((*pinm)->inm_addr)); #endif if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && (*pinm)->inm_addr.s_addr == gsa->sin.sin_addr.s_addr) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find a multicast source entry for this imo which matches * the given group index for this socket, and source address. */ struct in_msource * imo_match_source(struct ip_moptions *imo, size_t gidx, struct sockaddr *src) { struct in_mfilter *imf; struct in_msource *ims, *pims; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. 
*/ if (imo->imo_mfilters == NULL) return (NULL); pims = NULL; imf = &imo->imo_mfilters[gidx]; TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { /* * Perform bitwise comparison of two IPv4 addresses. * TODO: Do the same for IPv6. * Do not use sa_equal() for this as it is not aware of * deeper structure in sockaddr_in or sockaddr_in6. */ if (((struct sockaddr_in *)&ims->ims_addr)->sin_addr.s_addr == ((struct sockaddr_in *)src)->sin_addr.s_addr) { pims = ims; break; } } return (pims); } /* * Join an IPv4 multicast group. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *inm; inm = NULL; IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); IN_LOOKUP_MULTI(*ap, ifp, inm); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); ++inm->inm_refcount; } else do { sockunion_t gsa; struct ifmultiaddr *ifma; struct in_multi *ninm; int error; memset(&gsa, 0, sizeof(gsa)); gsa.sin.sin_family = AF_INET; gsa.sin.sin_len = sizeof(struct sockaddr_in); gsa.sin.sin_addr = *ap; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. * If so, bump the refcount on the existing network-layer * group association and return it. */ error = if_addmulti(ifp, &gsa.sa, &ifma); if (error) break; if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || inm->inm_addr.s_addr != ap->s_addr) panic("%s: ifma is inconsistent", __func__); #endif ++inm->inm_refcount; break; } /* * A new membership is needed; construct it and * perform the IGMP join. */ ninm = malloc(sizeof(*ninm), M_IPMADDR, M_NOWAIT | M_ZERO); if (ninm == NULL) { if_delmulti_ifma(ifma); break; } ninm->inm_addr = *ap; ninm->inm_ifp = ifp; ninm->inm_ifma = ifma; ninm->inm_refcount = 1; ifma->ifma_protospec = ninm; LIST_INSERT_HEAD(&in_multihead, ninm, inm_link); igmp_joingroup(ninm); inm = ninm; } while (0); IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); return (inm); } /* * Leave an IPv4 multicast group. * It is OK to call this routine if the underlying ifnet went away. * * XXX: To deal with the ifp going away, we cheat; the link-layer code in net * will set ifma_ifp to NULL when the associated ifnet instance is detached * from the system. * * The only reason we need to violate layers and check ifma_ifp here at all * is because certain hardware drivers still require Giant to be held, * and it must always be taken before other locks. */ void in_delmulti(struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm != NULL, ("%s: inm is NULL", __func__)); KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IFF_LOCKGIANT(ifp); } IN_MULTI_LOCK(); in_delmulti_locked(inm); IN_MULTI_UNLOCK(); if (ifp != NULL) IFF_UNLOCKGIANT(ifp); } /* * Delete a multicast address record, with locks held. * * It is OK to call this routine if the ifp went away. * Assumes that caller holds the IN_MULTI lock, and that * Giant was taken before other locks if required by the hardware. 
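 *
 * As an illustration (this simply mirrors in_delmulti() above; ifp and inm
 * are assumed to already be in hand), a caller is expected to follow the
 * pattern:
 *
 *	IFF_LOCKGIANT(ifp);
 *	IN_MULTI_LOCK();
 *	in_delmulti_locked(inm);
 *	IN_MULTI_UNLOCK();
 *	IFF_UNLOCKGIANT(ifp);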
*/ void in_delmulti_locked(struct in_multi *inm) { struct ifmultiaddr *ifma; IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_refcount >= 1, ("%s: freeing freed inm", __func__)); if (--inm->inm_refcount == 0) { igmp_leavegroup(inm); ifma = inm->inm_ifma; #ifdef DIAGNOSTIC if (bootverbose) printf("%s: purging ifma %p\n", __func__, ifma); #endif KASSERT(ifma->ifma_protospec == inm, ("%s: ifma_protospec != inm", __func__)); ifma->ifma_protospec = NULL; LIST_REMOVE(inm, inm_link); free(inm, M_IPMADDR); if_delmulti_ifma(ifma); } } /* * Block or unblock an ASM/SSM multicast source on an inpcb. */ static int inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; size_t idx; int error; int block; ifp = NULL; error = 0; block = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (mreqs.imr_interface.s_addr != INADDR_ANY) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) block = 1; #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) block = 1; break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } /* XXX INET6 */ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; /* * SSM multicast truth table for block/unblock operations. * * Operation Filter Mode Entry exists? Action * * block exclude no add source to filter * unblock include no add source to filter * block include no EINVAL * unblock exclude no EINVAL * block exclude yes EADDRNOTAVAIL * unblock include yes EADDRNOTAVAIL * block include yes remove source from filter * unblock exclude yes remove source from filter * * FreeBSD does not explicitly distinguish between ASM and SSM * mode sockets; all sockets are assumed to have a filter list. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imf_fmode is %s\n", __func__, imf->imf_fmode == MCAST_INCLUDE ? 
"include" : "exclude"); } #endif ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: adding %s to filter list\n", __func__, inet_ntoa(ssa->sin.sin_addr)); } #endif error = imo_join_source(imo, idx, ssa); } if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { /* * If the socket is in inclusive mode: * the source is already blocked as it has no entry. * If the socket is in exclusive mode: * the source is already unblocked as it has no entry. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ims %p; %s already [un]blocked\n", __func__, ims, inet_ntoa(ssa->sin.sin_addr)); } #endif error = EINVAL; } } else { if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { /* * If the socket is in exclusive mode: * the source is already blocked as it has an entry. * If the socket is in inclusive mode: * the source is already unblocked as it has an entry. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ims %p; %s already [un]blocked\n", __func__, ims, inet_ntoa(ssa->sin.sin_addr)); } #endif error = EADDRNOTAVAIL; } if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: removing %s from filter list\n", __func__, inet_ntoa(ssa->sin.sin_addr)); } #endif error = imo_leave_source(imo, idx, ssa); } } out_locked: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); imo = (struct ip_moptions *)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = (struct in_multi **)malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = (struct in_mfilter *)malloc( sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_IPMSOURCE, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) { imfp[idx].imf_fmode = MCAST_EXCLUDE; imfp[idx].imf_nsources = 0; TAILQ_INIT(&imfp[idx].imf_sources); } imo->imo_mfilters = imfp; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_IPMSOURCE); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). 
*/ void inp_freemoptions(struct ip_moptions *imo) { struct in_mfilter *imf; struct in_msource *ims, *tims; size_t idx, nmships; KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { in_delmulti(imo->imo_membership[idx]); if (imo->imo_mfilters != NULL) { imf = &imo->imo_mfilters[idx]; TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); FREE(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: did not free all imf_nsources", __func__)); } } if (imo->imo_mfilters != NULL) free(imo->imo_mfilters, M_IPMSOURCE); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct in_msource *ims; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); - INP_LOCK(inp); + INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; msfr.msfr_fmode = imf->imf_fmode; msfr.msfr_nsrcs = imf->imf_nsources; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * msfr.msfr_nsrcs is always set to the total number of filter * entries which the kernel currently has for this group. */ tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { /* * Make a copy of the source vector so that we do not * thrash the inpcb lock whilst copying it out. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ MALLOC(tss, struct sockaddr_storage *, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT); if (tss == NULL) { error = ENOBUFS; } else { ptss = tss; TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { memcpy(ptss++, &ims->ims_addr, sizeof(struct sockaddr_storage)); } } } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); FREE(tss, M_TEMP); } if (error) return (error); error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; - INP_LOCK(inp); + INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. 
*/ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (imo->imo_multicast_addr.s_addr != INADDR_ANY) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; IFP_TO_IA(ifp, ia); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; } } } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; ifp = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Obtain ifp. 
If no interface address was provided, * use the interface of the route in the unicast FIB for * the given multicast destination; usually, this is the * default route. * If this lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * If all of these conditions fail, return EADDRNOTAVAIL, and * reject the IPv4 multicast join. */ if (mreqs.imr_interface.s_addr != INADDR_ANY) { INADDR_TO_IFP(mreqs.imr_interface, ifp); } else { struct route ro; ro.ro_rt = NULL; *(struct sockaddr_in *)&ro.ro_dst = gsa->sin; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt != NULL) { ifp = ro.ro_rt->rt_ifp; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); RTFREE(ro.ro_rt); } else { struct in_ifaddr *ia; struct ifnet *mfp = NULL; TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { mfp = ia->ia_ifp; if (!(mfp->if_flags & IFF_LOOPBACK) && (mfp->if_flags & IFF_MULTICAST)) { ifp = mfp; break; } } } } #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * XXX INET6 */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } /* * Obtain the ifp. */ if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); /* * Check if we already hold membership of this group for this inpcb. * If so, we do not need to perform the initial join. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx != -1) { if (ssa->ss.ss_family != AF_UNSPEC) { /* * Attempting to join an ASM group (when already * an ASM or SSM member) is an error. */ error = EADDRNOTAVAIL; } else { imf = &imo->imo_mfilters[idx]; if (imf->imf_nsources == 0) { /* * Attempting to join an SSM group (when * already an ASM member) is an error. */ error = EINVAL; } else { /* * Attempting to join an SSM group (when * already an SSM member) means "add this * source to the inclusive filter list". */ error = imo_join_source(imo, idx, ssa); } } goto out_locked; } /* * Call imo_grow() to reallocate the membership and source filter * vectors if they are full. If the size would exceed the hard limit, * then we know we've really run out of entries. We keep the INP * lock held to avoid introducing a race condition. 
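 *
 * As a worked note on the growth rule in imo_grow(): each resize moves to
 * newmax = ((oldmax + 1) * 2) - 1, i.e. the next power-of-two minus one
 * (for example 15 -> 31 -> 63), and the resize is refused with
 * ETOOMANYREFS once newmax would exceed IP_MAX_MEMBERSHIPS.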
*/ if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_locked; } /* * So far, so good: perform the layer 3 join, layer 2 join, * and make an IGMP announcement if needed. */ inm = in_addmulti(&gsa->sin.sin_addr, ifp); if (inm == NULL) { error = ENOBUFS; goto out_locked; } idx = imo->imo_num_memberships; imo->imo_membership[idx] = inm; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(TAILQ_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); /* * If this is a new SSM group join (i.e. a source was specified * with this group), add this source to the filter list. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* * An initial SSM join implies that this socket's membership * of the multicast group is now in inclusive mode. */ imf->imf_fmode = MCAST_INCLUDE; error = imo_join_source(imo, idx, ssa); if (error) { /* * Drop inp lock before calling in_delmulti(), * to prevent a lock order reversal. */ --imo->imo_num_memberships; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); in_delmulti(inm); return (error); } } out_locked: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims, *tims; struct in_multi *inm; size_t idx; int error; ifp = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. 
*/ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (gsa->sin.sin_addr.s_addr != INADDR_ANY) INADDR_TO_IFP(mreqs.imr_interface, ifp); #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_locked; } imf = &imo->imo_mfilters[idx]; /* * If we were instructed only to leave a given source, do so. */ if (ssa->ss.ss_family != AF_UNSPEC) { if (imf->imf_nsources == 0 || imf->imf_fmode == MCAST_EXCLUDE) { /* * Attempting to SSM leave an ASM group * is an error; should use *_BLOCK_SOURCE instead. * Attempting to SSM leave a source in a group when * the socket is in 'exclude mode' is also an error. */ error = EINVAL; } else { error = imo_leave_source(imo, idx, ssa); } /* * If an error occurred, or this source is not the last * source in the group, do not leave the whole group. */ if (error || imf->imf_nsources > 0) goto out_locked; } /* * Give up the multicast address record to which the membership points. */ inm = imo->imo_membership[idx]; in_delmulti(inm); /* * Free any source filters for this group if they exist. * Revert inpcb to the default MCAST_EXCLUDE state. */ if (imo->imo_mfilters != NULL) { TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); FREE(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: imf_nsources not 0", __func__)); KASSERT(TAILQ_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); imf->imf_fmode = MCAST_EXCLUDE; } /* * Remove the gap in the membership array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_num_memberships--; out_locked: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. 
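 *
 * (Hypothetical userland illustration of the two accepted forms described
 * below; none of this is code from this file:
 *
 *	struct in_addr ia;
 *	ia.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ia, sizeof(ia));
 *
 *	struct ip_mreqn mr;
 *	memset(&mr, 0, sizeof(mr));
 *	mr.imr_ifindex = if_nametoindex("em0");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &mr, sizeof(mr));
 * )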
* * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (addr.s_addr == INADDR_ANY) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ifp = %p, addr = %s\n", __func__, ifp, inet_ntoa(addr)); /* XXX INET6 */ } #endif } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims, *tims; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > IP_MAX_SOURCE_FILTER || (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_locked; } imf = &imo->imo_mfilters[idx]; #ifdef DIAGNOSTIC if (bootverbose) printf("%s: clearing source list\n", __func__); #endif /* * Remove any existing source filters. */ TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); FREE(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: source list not cleared", __func__)); /* * Apply any new source filters, if present. 
*/ if (msfr.msfr_nsrcs > 0) { struct in_msource **pnims; struct in_msource *nims; struct sockaddr_storage *kss; struct sockaddr_storage *pkss; sockunion_t *psu; int i, j; /* * Drop the inp lock so we may sleep if we need to * in order to satisfy a malloc request. * We will re-take it before changing socket state. */ - INP_UNLOCK(inp); + INP_WUNLOCK(inp); #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: loading %lu source list entries\n", __func__, (unsigned long)msfr.msfr_nsrcs); } #endif /* * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ MALLOC(kss, struct sockaddr_storage *, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { FREE(kss, M_TEMP); return (error); } /* * Perform argument checking on every sockaddr_storage * structure in the vector provided to us. Overwrite * fields which should not apply to source entries. * TODO: Check for duplicate sources on this pass. */ psu = (sockunion_t *)kss; for (i = 0; i < msfr.msfr_nsrcs; i++, psu++) { switch (psu->ss.ss_family) { case AF_INET: if (psu->sin.sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; } else { psu->sin.sin_port = 0; } break; #ifdef notyet case AF_INET6; if (psu->sin6.sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; } else { psu->sin6.sin6_port = 0; psu->sin6.sin6_flowinfo = 0; } break; #endif default: error = EAFNOSUPPORT; break; } if (error) break; } if (error) { FREE(kss, M_TEMP); return (error); } /* * Allocate a block to track all the in_msource * entries we are about to allocate, in case we * abruptly need to free them. */ MALLOC(pnims, struct in_msource **, sizeof(struct in_msource *) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK | M_ZERO); /* * Allocate up to nsrcs individual chunks. * If we encounter an error, backtrack out of * all allocations cleanly; updates must be atomic. */ pkss = kss; nims = NULL; for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { MALLOC(nims, struct in_msource *, sizeof(struct in_msource) * msfr.msfr_nsrcs, M_IPMSOURCE, M_WAITOK | M_ZERO); pnims[i] = nims; } if (i < msfr.msfr_nsrcs) { for (j = 0; j < i; j++) { if (pnims[j] != NULL) FREE(pnims[j], M_IPMSOURCE); } FREE(pnims, M_TEMP); FREE(kss, M_TEMP); return (ENOBUFS); } INP_UNLOCK_ASSERT(inp); /* * Finally, apply the filters to the socket. * Re-take the inp lock; we are changing socket state. */ pkss = kss; - INP_LOCK(inp); + INP_WLOCK(inp); for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { memcpy(&(pnims[i]->ims_addr), pkss, sizeof(struct sockaddr_storage)); TAILQ_INSERT_TAIL(&imf->imf_sources, pnims[i], ims_next); imf->imf_nsources++; } FREE(pnims, M_TEMP); FREE(kss, M_TEMP); } /* * Update the filter mode on the socket before releasing the inpcb. */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); imf->imf_fmode = msfr.msfr_fmode; out_locked: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. 
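 *
 * A minimal illustration of the classic any-source join serviced here,
 * issued on an already-created UDP socket s (sketch only; 239.1.1.1 is
 * just an example group):
 *
 *	struct ip_mreq mreq;
 *
 *	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
 *	mreq.imr_interface.s_addr = INADDR_ANY;
 *	setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * IP_ADD_MEMBERSHIP and its source-specific relatives are dispatched to
 * inp_join_group() by the switch below.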
*/ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. * XXX Unlocked read of inp_socket believed OK. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_change_source_filter(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 178284) +++ head/sys/netinet/in_pcb.c (revision 178285) @@ -1,1493 +1,1502 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ int ipport_reservedlow = 0; /* Variables dealing with random ephemeral port allocation. 
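 *
 * The user-controllable knobs are exported as sysctls under
 * net.inet.ip.portrange by the declarations below; for example, random
 * ephemeral port selection can be turned off with:
 *
 *	sysctl net.inet.ip.portrange.randomized=0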
*/ int ipport_randomized = 1; /* user controlled via sysctl */ int ipport_randomcps = 10; /* user controlled via sysctl */ int ipport_randomtime = 45; /* user controlled via sysctl */ int ipport_stoprandom = 0; /* toggled by ipport_tick */ int ipport_tcpallocs; int ipport_tcplastcount; #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0) { RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, &ipport_randomized, 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, &ipport_randomcps, 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, &ipport_randomtime, 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. 
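 *
 * Caller sketch (illustrative; udbinfo stands in for any protocol's
 * pcbinfo): the pcbinfo must be write-locked across the call and the
 * new pcb comes back write-locked, so a typical attach routine does
 * roughly:
 *
 *	INP_INFO_WLOCK(&udbinfo);
 *	error = in_pcballoc(so, &udbinfo);
 *	if (error == 0) {
 *		inp = sotoinpcb(so);
 *		...protocol-specific setup...
 *		INP_WUNLOCK(inp);
 *	}
 *	INP_INFO_WUNLOCK(&udbinfo);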
*/ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; SOCK_LOCK(so); mac_inpcb_create(so, inp); SOCK_UNLOCK(so); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - INP_LOCK(inp); + INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; #if defined(IPSEC) || defined(MAC) out: if (error != 0) uma_zfree(pcbinfo->ipi_zone, inp); #endif return (error); } int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = inp->inp_lport == 0 && (nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0); error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error, prison = 0; int dorandom; - INP_INFO_WLOCK_ASSERT(pcbinfo); + /* + * Because no actual state changes occur here, a write global write + * lock on the pcbinfo isn't required. + */ + INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; if (nam) { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif if (sin->sin_addr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &sin->sin_addr.s_addr)) return(EINVAL); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. 
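 *
 * If the caller did not ask for a port change, the pre-existing *lportp
 * is reinstated just below, and the ephemeral allocation further down
 * runs only when no port is bound at all.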
*/ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= ipport_reservedhigh && ntohs(lport) >= ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (jailed(cred)) prison = 1; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(so->so_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(inp->inp_pcbinfo, sin->sin_addr, lport, prison ? 0 : INPLOOKUP_WILDCARD); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((t->inp_vflag & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && (so->so_cred->cr_uid != t->inp_socket->so_cred->cr_uid)) return (EADDRINUSE); } if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr)) return (EADDRNOTAVAIL); t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, prison ? 0 : wild); if (t && (t->inp_vflag & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(inp); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & t->inp_socket->so_options) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket)) #endif return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { u_short first, last, aux; int count; if (laddr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); if (inp->inp_flags & INP_HIGHPORT) { first = ipport_hifirstauto; /* sysctl */ last = ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; first = ipport_lowfirstauto; /* 1023 */ last = ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = ipport_firstauto; /* sysctl */ last = ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP, use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (ipport_randomized && (!ipport_stoprandom || pcbinfo == &udbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ if (pcbinfo != &udbinfo) ipport_tcpallocs++; /* * Simple check to ensure all ports are not used up causing * a deadlock here. 
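 *
 * Worked example (with the defaults shown above, first = 10000 and
 * last = 65535): when dorandom is set, the scan starts at a random
 * offset into that range, then walks upward one port at a time,
 * wrapping back to first, and gives up with EADDRNOTAVAIL only once
 * roughly last - first candidates have all been found in use by
 * in_pcblookup_local().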
*/ if (first > last) { aux = first; first = last; last = aux; } if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local(pcbinfo, laddr, lport, wild)); } if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash(inp); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct sockaddr_in sa; struct ucred *socred; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + /* + * Because a global state change doesn't actually occur here, a read + * lock is sufficient. + */ + INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; socred = inp->inp_socket->so_cred; if (laddr.s_addr == INADDR_ANY && jailed(socred)) { bzero(&sa, sizeof(sa)); sa.sin_addr.s_addr = htonl(prison_getip(socred)); sa.sin_len = sizeof(sa); sa.sin_family = AF_INET; error = in_pcbbind_setup(inp, (struct sockaddr *)&sa, &laddr.s_addr, &lport, cred); if (error) return (error); } if (!TAILQ_EMPTY(&in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. 
* If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; else if (faddr.s_addr == (u_long)INADDR_BROADCAST && (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) faddr = satosin(&TAILQ_FIRST( &in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { ia = (struct in_ifaddr *)0; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) ia = ip_rtaddr(faddr); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (ia == 0) { bzero(&sa, sizeof(sa)); sa.sin_addr = faddr; sa.sin_len = sizeof(sa); sa.sin_family = AF_INET; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa))); if (ia == 0) ia = ifatoia(ifa_ifwithnet(sintosa(&sa))); if (ia == 0) return (ENETUNREACH); } /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, use the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; if (ia == 0) return (EADDRNOTAVAIL); } } laddr = ia->ia_addr.sin_addr; } oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } /* * In the old world order, in_pcbdetach() served two functions: to detach the * pcb from the socket/potentially free the socket, and to free the pcb * itself. In the new world order, the protocol code is responsible for * managing the relationship with the socket, and this code simply frees the * pcb. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL")); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *ipi = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL")); + INP_INFO_WLOCK_ASSERT(ipi); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef IPSEC ipsec4_delete_pcbpolicy(inp); #endif /*IPSEC*/ inp->inp_gencnt = ++ipi->ipi_gencnt; in_pcbremlists(inp); if (inp->inp_options) (void)m_free(inp->inp_options); if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; #ifdef MAC mac_inpcb_destroy(inp); #endif - INP_UNLOCK(inp); + INP_WUNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } /* * TCP needs to maintain its inpcb structure after the TCP connection has * been torn down. 
However, it must be disconnected from the inpcb hashes as * it must not prevent binding of future connections to the same port/ip * combination by other inpcbs. */ void in_pcbdrop(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); inp->inp_vflag |= INP_DROPPED; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } inp->inp_lport = 0; } } /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { - INP_LOCK(inp); + INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { - INP_LOCK(inp); + INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. 
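 *
 * In the wildcard case below, each candidate is scored by how loosely
 * its foreign and local addresses fit the search key: exact fits score
 * 0, wildcard fits add a point each, outright local-address mismatches
 * are skipped, and an IPv6 pcb bound to :: pays an extra
 * INP_LOOKUP_MAPPED_PCB_COST.  The lowest score wins and a score of 0
 * ends the scan early; for example, a pcb bound to 10.0.0.1:80 beats
 * one bound to *:80 for a lookup on that address and port.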
*/ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_int lport_arg, int wild_okay) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; u_short lport = lport_arg; - INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_INFO_LOCK_ASSERT(pcbinfo); if (!wild_okay) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found. */ return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) { break; } } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST /* * Lookup PCB in hash list. */ struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; - INP_INFO_RLOCK_ASSERT(pcbinfo); + INP_INFO_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) return (inp); } /* * Then look for a wildcard match, if requested. 
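 *
 * For example, an inbound segment addressed to 10.0.0.1:80 from
 * 192.0.2.7:12345 is first looked up as that full four-tuple above;
 * only when no connected pcb exists does the pass below fall back to
 * listeners bound to 10.0.0.1:80 or *:80, preferring the more specific
 * local binding.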
*/ if (wildcard) { struct inpcb *local_wild = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; if (inp->inp_laddr.s_addr == laddr.s_addr) return (inp); else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 if (INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) local_wild_mapped = inp; else #endif local_wild = inp; } } } #ifdef INET6 if (local_wild == NULL) return (local_wild_mapped); #endif return (local_wild); } return (NULL); } /* * Insert PCB onto various hash lists. */ int in_pcbinshash(struct inpcb *inp) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); return (0); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } /* * Remove PCB from various lists. 
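 *
 * The caller must hold both the pcbinfo write lock and the pcb write
 * lock, as the asserts below require, since the global pcb list, the
 * hash chain and the per-port list are all modified here.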
*/ void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ void ipport_tick(void *xtp) { if (ipport_tcpallocs <= ipport_tcplastcount + ipport_randomcps) { if (ipport_stoprandom > 0) ipport_stoprandom--; } else ipport_stoprandom = ipport_randomtime; ipport_tcplastcount = ipport_tcpallocs; callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } void inp_wlock(struct inpcb *inp) { - INP_LOCK(inp); + INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { - INP_LOCK(inp); + INP_WLOCK(inp); } void inp_runlock(struct inpcb *inp) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags == 1) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else { #endif /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); #ifdef INET6 } #endif db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_FAITH) { db_printf("%sINP_FAITH", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? 
", " : ""); comma = 1; } } void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h (revision 178284) +++ head/sys/netinet/in_pcb.h (revision 178285) @@ -1,436 +1,446 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include +#include #include +#ifdef _KERNEL +#include +#endif + #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ struct inpcbpolicy; /* * Struct inpcb is the ommon structure pcb for the Internet Protocol * implementation. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union { /* foreign host table entry */ struct in_addr_4in6 ie46_foreign; struct in6_addr ie6_foreign; } ie_dependfaddr; union { /* local host table entry */ struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 #define ie6_faddr ie_dependfaddr.ie6_foreign #define ie6_laddr ie_dependladdr.ie6_local }; /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_pad; /* XXX alignment for in_endpoints */ /* protocol dependent part */ struct in_endpoints inc_ie; }; #define inc_isipv6 inc_flags /* temp compatability */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr struct icmp6_filter; struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ void *inp_ppcb; /* pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* PCB list info */ struct socket *inp_socket; /* back pointer to socket */ u_int32_t inp_flow; int inp_flags; /* generic IP/datagram flags */ u_char inp_vflag; /* IP version flag (v4/v6) */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ #define INP_TIMEWAIT 0x8 /* .. probably doesn't go here */ #define INP_ONESBCAST 0x10 /* send all-ones broadcast */ #define INP_DROPPED 0x20 /* protocol drop flag */ #define INP_SOCKREF 0x40 /* strong socket reference */ u_char inp_ip_ttl; /* time to live proto */ u_char inp_ip_p; /* protocol proto */ u_char inp_ip_minttl; /* minimum TTL or drop */ uint32_t inp_ispare1; /* connection id / queue id */ void *inp_pspare[2]; /* rtentry / general use */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* list for this PCB's local port */ struct label *inp_label; /* MAC label */ struct inpcbpolicy *inp_sp; /* for IPSEC */ /* Protocol-dependent part; options. 
*/ struct { u_char inp4_ip_tos; /* type of service proto */ struct mbuf *inp4_options; /* IP options */ struct ip_moptions *inp4_moptions; /* IP multicast options */ } inp_depend4; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions struct { /* IP options */ struct mbuf *inp6_options; /* IP6 options for outgoing packets */ struct ip6_pktopts *inp6_outputopts; /* IP multicast options */ struct ip6_moptions *inp6_moptions; /* ICMPv6 code type filter */ struct icmp6_filter *inp6_icmp6filt; /* IPV6_CHECKSUM setsockopt */ int inp6_cksum; short inp6_hops; } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; struct inpcbport *inp_phd; /* head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* generation count of this instance */ - struct mtx inp_mtx; + struct rwlock inp_lock; #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_ip6_nxt inp_ip_p #define in6p_flowinfo inp_flow #define in6p_vflag inp_vflag #define in6p_options inp_depend6.inp6_options #define in6p_outputopts inp_depend6.inp6_outputopts #define in6p_moptions inp_depend6.inp6_moptions #define in6p_icmp6filt inp_depend6.inp6_icmp6filt #define in6p_cksum inp_depend6.inp6_cksum #define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ #define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ }; /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct inpcb xi_inp; struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; /* * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. */ struct inpcbinfo { /* * Global list of inpcbs on the protocol. */ struct inpcbhead *ipi_listhead; u_int ipi_count; /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ struct inpcbhead *ipi_hashbase; u_long ipi_hashmask; /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; u_long ipi_porthashmask; /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; u_short ipi_lastlow; u_short ipi_lasthi; /* * UMA zone from which inpcbs are allocated for this protocol. 
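 *
 * Each protocol creates this zone at initialization time; div_init() in
 * ip_divert.c below is one example, capping the zone at maxsockets and
 * supplying per-pcb init/fini hooks that set up and tear down the pcb
 * lock.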
*/ struct uma_zone *ipi_zone; /* * Generation count--incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; - struct mtx ipi_mtx; + struct rwlock ipi_lock; /* * vimage 1 * general use 1 */ void *ipi_pspare[2]; }; #define INP_LOCK_INIT(inp, d, t) \ - mtx_init(&(inp)->inp_mtx, (d), (t), MTX_DEF | MTX_RECURSE | MTX_DUPOK) -#define INP_LOCK_DESTROY(inp) mtx_destroy(&(inp)->inp_mtx) -#define INP_LOCK(inp) mtx_lock(&(inp)->inp_mtx) -#define INP_UNLOCK(inp) mtx_unlock(&(inp)->inp_mtx) -#define INP_LOCK_ASSERT(inp) mtx_assert(&(inp)->inp_mtx, MA_OWNED) -#define INP_UNLOCK_ASSERT(inp) mtx_assert(&(inp)->inp_mtx, MA_NOTOWNED) + rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) +#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) +#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) +#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) +#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) +#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) +#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) +#define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) +#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) +#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) #ifdef _KERNEL /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. */ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANTS void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else static __inline void inp_lock_assert(struct inpcb *inp __unused) { } static __inline void inp_unlock_assert(struct inpcb *inp __unused) { } #endif #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ - mtx_init(&(ipi)->ipi_mtx, (d), NULL, MTX_DEF | MTX_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_mtx) -#define INP_INFO_RLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) -#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) -#define INP_INFO_RUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) -#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) -#define INP_INFO_RLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_mtx, MA_OWNED) -#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_mtx, MA_OWNED) -#define INP_INFO_UNLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_mtx, MA_NOTOWNED) + rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) +#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) +#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) +#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) +#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) +#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) +#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) +#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) +#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) /* flags in inp_flags: */ #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x04 /* receive IP dst address */ #define 
INP_HDRINCL 0x08 /* user supplies entire IP header */ #define INP_HIGHPORT 0x10 /* user wants "high" port binding */ #define INP_LOWPORT 0x20 /* user wants "low" port binding */ #define INP_ANONPORT 0x40 /* port chosen for user */ #define INP_RECVIF 0x80 /* receive incoming interface */ #define INP_MTUDISC 0x100 /* user can do MTU discovery */ #define INP_FAITH 0x200 /* accept FAITH'ed connections */ #define INP_RECVTTL 0x400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x800 /* don't fragment packet */ #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL) /* for KAME src sync over BSD*'s */ #define IN6P_HIGHPORT INP_HIGHPORT #define IN6P_LOWPORT INP_LOWPORT #define IN6P_ANONPORT INP_ANONPORT #define IN6P_RECVIF INP_RECVIF #define IN6P_MTUDISC INP_MTUDISC #define IN6P_FAITH INP_FAITH #define IN6P_CONTROLOPTS INP_CONTROLOPTS /* * socket AF version is {newer than,or include} * actual datagram AF version */ #define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL extern int ipport_reservedhigh; extern int ipport_reservedlow; extern int ipport_lowfirstauto; extern int ipport_lowlastauto; extern int ipport_firstauto; extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); struct inpcb * in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbrehash(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int 
in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); void in_pcbremlists(struct inpcb *inp); void ipport_tick(void *xtp); /* * Debugging routines compiled in when DDB is present. */ void db_print_inpcb(struct inpcb *inp, const char *name, int indent); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 178284) +++ head/sys/netinet/ip_divert.c (revision 178285) @@ -1,756 +1,756 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." #endif #ifndef IPFIREWALL #error "IPDIVERT requires IPFIREWALL" #endif #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. * Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. 
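 *
 * As an informal illustration of the userland side of this exchange
 * (reinjection details follow below; the divert port 6000, buf and n are
 * placeholders chosen only for the example, error checking omitted):
 *
 *	int fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
 *	struct sockaddr_in sin;
 *	socklen_t slen = sizeof(sin);
 *
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(6000);		/* divert port of the ipfw rule */
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	n = recvfrom(fd, buf, sizeof(buf), 0,	/* sin_port now holds the cookie */
 *	    (struct sockaddr *)&sin, &slen);
 *	sendto(fd, buf, n, 0,			/* reinject the (possibly edited) packet */
 *	    (struct sockaddr *)&sin, slen);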
* If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables. */ static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } void div_init(void) { INP_INFO_LOCK_INIT(&divcbinfo, "div"); LIST_INIT(&divcb); divcbinfo.ipi_listhead = &divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &divcbinfo.ipi_hashmask); divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &divcbinfo.ipi_porthashmask); divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); } /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ void div_input(struct mbuf *m, int off) { ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (mtag == NULL) { printf("%s: no divert tag\n", __func__); m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { ip->ip_len = ntohs(ip->ip_len); in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_len = htons(ip->ip_len); } /* * Record receive interface address, if any. * But only for incoming packets. 
*/ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ if (incoming) { struct ifaddr *ifa; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)divert_info(mtag)); INP_INFO_RLOCK(&divcbinfo); LIST_FOREACH(inp, &divcb, inp_list) { - INP_LOCK(inp); + INP_WLOCK(inp); /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&divcbinfo); if (sa == NULL) { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { struct m_tag *mtag; struct divert_tag *dt; int error = 0; struct mbuf *options; /* * An mbuf may hasn't come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; if (control) m_freem(control); /* XXX */ if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } dt = (struct divert_tag *)(mtag+1); m_tag_prepend(m, mtag); } else dt = (struct divert_tag *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; dt->cookie = sin->sin_port; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. 
*/ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct ip *const ip = mtod(m, struct ip *); struct inpcb *inp; dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; INP_INFO_WLOCK(&divcbinfo); inp = sotoinpcb(so); - INP_LOCK(inp); + INP_WLOCK(inp); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); m_freem(m); } else { /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present. */ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); options = NULL; /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_DONTWAIT); if (options == NULL) error = ENOBUFS; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); if (error == ENOBUFS) { m_freem(m); return (error); } error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); if (options != NULL) m_freem(options); } } else { dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. 
*/ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC SOCK_LOCK(so); mac_socket_create_mbuf(so, m); SOCK_UNLOCK(so); #endif /* Send packet to input processing via netisr */ netisr_queue(NETISR_IP, m); } return error; cantsend: m_freem(m); return error; } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&divcbinfo); error = in_pcballoc(so, &divcbinfo); if (error) { INP_INFO_WUNLOCK(&divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&divcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&divcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); socantsendmore(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
*/ INP_INFO_RLOCK(&divcbinfo); gencnt = divcbinfo.ipi_gencnt; n = divcbinfo.ipi_count; INP_INFO_RUNLOCK(&divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&divcbinfo); for (inp = LIST_FIRST(divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&divcbinfo); xig.xig_gen = divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = divcbinfo.ipi_count; INP_INFO_RUNLOCK(&divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; int n; switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); ip_divert_ptr = divert_packet; break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. 
*/ INP_INFO_WLOCK(&divcbinfo); n = divcbinfo.ipi_count; if (n != 0) { err = EBUSY; INP_INFO_WUNLOCK(&divcbinfo); break; } ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&divcbinfo); INP_INFO_LOCK_DESTROY(&divcbinfo); uma_zdestroy(divcbinfo.ipi_zone); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(ipdivert, 1); Index: head/sys/netinet/ip_fw2.c =================================================================== --- head/sys/netinet/ip_fw2.c (revision 178284) +++ head/sys/netinet/ip_fw2.c (revision 178285) @@ -1,4568 +1,4568 @@ /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ #include /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. 
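 *
 * For example (illustrative only), after "ipfw set disable 5" bit 5 is
 * set here, and a rule r belonging to that set is skipped by a test of
 * roughly this shape:
 *
 *	if (set_disable & (1 << r->set))
 *		continue;		/* whole set is disabled */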
*/ static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 /* * Data structure to cache our ucred related * information. This structure only gets used if * the user specified UID/GID based constraints in * a firewall rule. */ struct ip_fw_ugid { gid_t fw_groups[NGROUPS]; int fw_ngroups; uid_t fw_uid; int fw_prid; }; /* * list of rules for layer 3 */ struct ip_fw_chain layer3_chain; MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); #define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) ipfw_nat_t *ipfw_nat_ptr = NULL; ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; static int fw_debug = 1; static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0, ipfw_chg_hook, "I", "Enable ipfw"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_verbose, 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! 
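 *
 * As an informal example of the first two uses listed above, a rule like
 *
 *	ipfw add allow tcp from any to me 22 setup keep-state
 *
 * installs one O_KEEP_STATE entry per accepted connection, while
 *
 *	ipfw add allow tcp from any to me 22 setup limit src-addr 4
 *
 * additionally keeps an O_LIMIT_PARENT entry per source address, whose
 * count caps the number of O_LIMIT children (the connections) at four.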
*/ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) /* * Timeouts for various events in handing dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static u_int32_t dyn_keepalive_interval = 20; static u_int32_t dyn_keepalive_period = 5; static u_int32_t dyn_keepalive = 1; /* do send keepalives */ static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &dyn_buckets, 0, "Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &dyn_count, 0, "Number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &dyn_max, 0, "Max number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &static_count, 0, "Number of static rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &dyn_keepalive, 0, "Enable keepalives for dyn. 
rules"); #ifdef INET6 /* * IPv6 specific variables */ SYSCTL_DECL(_net_inet6_ip6); static struct sysctl_ctx_list ip6_fw_sysctl_ctx; static struct sysctl_oid *ip6_fw_sysctl_tree; #endif /* INET6 */ #endif /* SYSCTL_NODE */ static int fw_deny_unknown_exthdrs = 1; /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { struct ifaddr *ia; /* XXX lock? */ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). 
* * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. The 'versrcreach' * option just checks that the source address is reachable via any route * (except default) in the routing table. These two are a measure to block * forged packets. This is also commonly known as "anti-spoofing" or Unicast * Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that syntax is * misleading. The check may be performed on all IP packets whether unicast, * multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp) { struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &ifnet, if_link) TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) return 1; } } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; rtalloc_ign((struct route *)&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. 
* (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port); return i; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp_seq ack, seq; int flags; struct { struct ip6_hdr ip6; struct tcphdr th; } ti; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) != 0) { m_freem(m); args->m = NULL; return; } ti.ip6 = *ip6; ti.th = *tcp; ti.th.th_seq = ntohl(ti.th.th_seq); ti.th.th_ack = ntohl(ti.th.th_ack); ti.ip6.ip6_nxt = IPPROTO_TCP; if (ti.th.th_flags & TH_ACK) { ack = 0; seq = ti.th.th_ack; flags = TH_RST; } else { ack = ti.th.th_seq; if ((m->m_flags & M_PKTHDR) != 0) { /* * total new data to ACK is: * total packet length, * minus the header length, * minus the tcp header length. */ ack += m->m_pkthdr.len - hlen - (ti.th.th_off << 2); } else if (ip6->ip6_plen) { ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) - hlen - (ti.th.th_off << 2); } else { m_freem(m); return; } if (tcp->th_flags & TH_SYN) ack++; seq = 0; flags = TH_RST|TH_ACK; } bcopy(&ti, ip6, sizeof(ti)); /* * m is only used to recycle the mbuf * The data in it is never read so we don't need * to correct the offsets or anything */ tcp_respond(NULL, ip6, tcp, m, ack, seq, flags); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else m_freem(m); args->m = NULL; } #endif /* INET6 */ static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! 
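 *
 * The resulting syslog line looks roughly like this (addresses, rule
 * number and interface invented for the example):
 *
 *	ipfw: 300 Deny TCP 10.0.0.5:4567 192.168.0.1:80 in via fxp0
 *
 * i.e. rule number, action, protocol with addresses and ports, the
 * direction, the interface and, when applicable, fragment information.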
*/ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { struct ether_header *eh = args->eh; char *action; int limit_reached = 0; char action2[40], proto[128], fragment[32]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(dummyaddr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; char src[48], dst[48]; struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntoa_r(ip->ip_src, src); inet_ntoa_r(ip->ip_dst, dst); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", 
dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.frag_id6, ntohs(ip6->ip6_plen) - hlen, ntohs(offset & IP6F_OFF_MASK) << 3, (offset & IP6F_MORE_FRAG) ? "+" : ""); } else #endif { int ip_off, ip_len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) ? "+" : ""); } } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. 
* * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) return; last_remove = time_uptime; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. 
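 * Both directions of a session resolve to the same entry: e.g. a flow
 * created by a SYN from A:1025 to B:80 is found again when the SYN+ACK
 * from B:80 to A:1025 arrives, reported as MATCH_REVERSE (defined just
 * below), since the hash is commutative in source and destination.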
* Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && q->dyn_type != O_LIMIT_PARENT) { if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6)) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.dst_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.src_ip6)) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_uptime + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_uptime + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. 
*/ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. 
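 *
 * (The flow id passed in has already been reduced by install_state() to
 * just the fields selected by the rule's limit_mask, so with e.g.
 * "limit src-addr" all connections from one source share a single
 * parent entry.)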
*/ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ static int install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { static int last_log; ipfw_dyn_rule *q; struct in_addr da; char src[48], dst[48]; src[0] = '\0'; dst[0] = '\0'; DEB( printf("ipfw: %s: type %d 0x%08x %u -> 0x%08x %u\n", __func__, cmd->o.opcode, (args->f_id.src_ip), (args->f_id.src_port), (args->f_id.dst_ip), (args->f_id.dst_port)); ) IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: entry already present, done\n", __func__); } IPFW_DYN_UNLOCK(); return (0); } if (dyn_count >= dyn_max) /* Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (dyn_count >= dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: Too many dynamic rules\n", __func__); } IPFW_DYN_UNLOCK(); return (1); /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: { /* limit number of sessions */ struct ipfw_flow_id id; ipfw_dyn_rule *parent; uint32_t conn_limit; uint16_t limit_mask = cmd->limit_mask; conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? tablearg : cmd->conn_limit; DEB( if (cmd->conn_limit == IP_FW_TABLEARG) printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " "(tablearg)\n", __func__, conn_limit); else printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", __func__, conn_limit); ) id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; id.addr_type = args->f_id.addr_type; if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { printf("ipfw: %s: add parent failed\n", __func__); IPFW_DYN_UNLOCK(); return (1); } if (parent->count >= conn_limit) { /* See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= conn_limit) { if (fw_verbose && last_log != time_uptime) { last_log = time_uptime; #ifdef INET6 /* * XXX IPv6 flows are not * supported yet. 
*/ if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); } else #endif { da.s_addr = htonl(args->f_id.src_ip); inet_ntoa_r(da, src); da.s_addr = htonl(args->f_id.dst_ip); inet_ntoa_r(da, dst); } log(LOG_SECURITY | LOG_DEBUG, "ipfw: %d %s %s:%u -> %s:%u, %s\n", parent->rule->rulenum, "drop session", src, (args->f_id.src_port), dst, (args->f_id.dst_port), "too many entries"); } IPFW_DYN_UNLOCK(); return (1); } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); break; } default: printf("ipfw: %s: unknown dynamic rule type %u\n", __func__, cmd->o.opcode); IPFW_DYN_UNLOCK(); return (1); } /* XXX just set lifetime */ lookup_dyn_rule_locked(&args->f_id, NULL, NULL); IPFW_DYN_UNLOCK(); return (0); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ static struct mbuf * send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) return (NULL); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ? */ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); } /* * sends a reject message, consuming the mbuf passed as an argument. 
*/ static void send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ if (args->eh != NULL) { ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. */ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); if (cmd->opcode == O_ALTQ) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } static int add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { struct radix_node_head *rnh; struct table_entry *ent; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); if (ent == NULL) return (ENOMEM); ent->value = value; ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; IPFW_WLOCK(&layer3_chain); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { IPFW_WUNLOCK(&layer3_chain); free(ent, M_IPFW_TBL); return (EEXIST); } IPFW_WUNLOCK(&layer3_chain); return (0); } static int del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa, mask; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; sa.sin_len = mask.sin_len = 8; mask.sin_addr.s_addr = htonl(mlen ? 
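	/*
	 * Build the CIDR netmask for prefix length 'mlen' in host order and
	 * convert it to network order: e.g. mlen = 24 gives
	 * ~((1 << 8) - 1) = 0xffffff00 (255.255.255.0), while mlen = 0
	 * yields an all-zero mask that matches any address.
	 * add_table_entry() above uses the same expression.
	 */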
~((1 << (32 - mlen)) - 1) : 0); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; IPFW_WLOCK(ch); ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); if (ent == NULL) { IPFW_WUNLOCK(ch); return (ESRCH); } IPFW_WUNLOCK(ch); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static int flush_table(struct ip_fw_chain *ch, uint16_t tbl) { struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; KASSERT(rnh != NULL, ("NULL IPFW table")); rnh->rnh_walktree(rnh, flush_table_entry, rnh); return (0); } static void flush_tables(struct ip_fw_chain *ch) { uint16_t tbl; IPFW_WLOCK_ASSERT(ch); for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) flush_table(ch, tbl); } static int init_tables(struct ip_fw_chain *ch) { int i; uint16_t j; for (i = 0; i < IPFW_TABLES_MAX; i++) { if (!rn_inithead((void **)&ch->tables[i], 32)) { for (j = 0; j < i; j++) { (void) flush_table(ch, j); } return (ENOMEM); } } return (0); } static int lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa; if (tbl >= IPFW_TABLES_MAX) return (0); rnh = ch->tables[tbl]; sa.sin_len = 8; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); if (ent != NULL) { *val = ent->value; return (1); } return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } static int count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; *cnt = 0; rnh->rnh_walktree(rnh, count_table_entry, cnt); return (0); } static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } static int dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl->tbl]; tbl->cnt = 0; rnh->rnh_walktree(rnh, dump_table_entry, tbl); return (0); } static void fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp) { struct ucred *cr; if (inp->inp_socket != NULL) { cr = inp->inp_socket->so_cred; ugp->fw_prid = jailed(cr) ? cr->cr_prison->pr_id : -1; ugp->fw_uid = cr->cr_uid; ugp->fw_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups)); } } static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; int match; gid_t *gp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. 
*/ if (inp && *lookup == 0) { - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { fill_ugid_cache(inp, ugp); *lookup = 1; } } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*lookup == -1) return (0); if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = INPLOOKUP_WILDCARD; pi = &udbinfo; } else return 0; match = 0; if (*lookup == 0) { INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb != NULL) { - INP_LOCK(pcb); + INP_WLOCK(pcb); if (pcb->inp_socket != NULL) { fill_ugid_cache(pcb, ugp); *lookup = 1; } - INP_UNLOCK(pcb); + INP_WUNLOCK(pcb); } INP_INFO_RUNLOCK(pi); if (*lookup == 0) { /* * If the lookup did not yield any results, there * is no sense in coming back and trying again. So * we can set lookup to -1 and ensure that we wont * bother the pcb system again. */ *lookup = -1; return (0); } } if (insn->o.opcode == O_UID) match = (ugp->fw_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) { for (gp = ugp->fw_groups; gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++) if (*gp == (gid_t)insn->d[0]) { match = 1; break; } } else if (insn->o.opcode == O_JAIL) match = (ugp->fw_prid == (int)insn->d[0]); return match; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->cookie a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state during the processing of a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. 
* Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ struct ip_fw_ugid fw_ugid_cache; int ugid_lookup = 0; /* * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG * associated with a packet input on a divert socket. This * will allow to distinguish traffic and its direction when * it originates from a divert socket. */ u_int divinput_flags = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset == 0 means there is no Fragment Header. * If offset != 0 for IPv6 always use correct mask to * get the correct offset because we add IP6F_MORE_FRAG * to be able to dectect the first fragment which would * otherwise have offset = 0. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int pktlen; u_int16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &layer3_chain; struct m_tag *mtag; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; u_int16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; if (m->m_flags & M_SKIP_FIREWALL) return (IP_FW_PASS); /* accept */ pktlen = m->m_pkthdr.len; proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). 
*/ #define PULLUP_TO(len, p, T) \ do { \ int x = (len) + sizeof(T); \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); args->f_id.flags = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; /* Add IP6F_MORE_FRAG for offset of first * fragment to be != 0. */ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.frag_id6 = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? 
*/ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } pktlen = ip_len < pktlen ? ip_len : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = mtag ? divert_cookie(mtag) : 0; f = chain->rules; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) { IPFW_RUNLOCK(chain); return (IP_FW_DENY); /* invalid */ } while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) { /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } } } /* reset divert rule to avoid confusion later */ if (mtag) { divinput_flags = divert_info(mtag) & (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG); m_tag_delete(m, mtag); } /* * Now scan the rules, and parse microinstructions for each rule. 
*/ for (; f; f = f->next) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset!=0) break; if (is_ipv6) /* XXX to be fixed later */ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &fw_ugid_cache, &ugid_lookup, args->inp); break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: match = (cmd->arg1 & 1 && divinput_flags & IP_FW_DIVERT_LOOPBACK_FLAG) || (cmd->arg1 & 2 && divinput_flags & IP_FW_DIVERT_OUTPUT_FLAG); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v; match = lookup_table(chain, cmd->arg1, a, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? 
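	/*
	 * The instruction body carries (address, mask) pairs in network
	 * byte order; a pair matches when the packet address ANDed with
	 * the mask equals the stored address, e.g. a 10.0.0.0/8
	 * specification corresponds to p[0] = 10.0.0.0, p[1] = 255.0.0.0.
	 */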
dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = ip_len; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = ip_len - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == TCP(ulp)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? 
*/ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; at = pf_get_mtag(m); if (at == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } at->qid = altq->qid; if (is_ipv4) at->af = AF_INET; else at->af = AF_LINK; at->hdr = ip; break; } case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL) : #endif verify_path(src_ip, NULL))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); break; case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); } else if (mtag == NULL) { if ((mtag = m_tag_alloc(MTAG_IPFW, tag, 0, M_NOWAIT)) != NULL) m_tag_prepend(m, mtag); } match = (cmd->len & F_NOT) ? 0: 1; break; } case O_TAGGED: { uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? 
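	/*
	 * As for O_TAG above, IP_FW_TABLEARG means the tag value comes
	 * from the preceding table lookup (tablearg).  A one-word
	 * instruction tests a single tag; longer instructions carry
	 * ranges of tag values, scanned in the loop below.
	 */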
tablearg : cmd->arg1; if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached to make * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { retval = IP_FW_DENY; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += pktlen; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; IPFW_DYN_UNLOCK(); goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. 
*/ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = IP_FW_DUMMYNET; goto done; case O_DIVERT: case O_TEE: { struct divert_tag *dt; if (args->eh) /* not on layer 2 */ break; mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT); if (mtag == NULL) { /* XXX statistic */ /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } dt = (struct divert_tag *)(mtag+1); dt->cookie = f->rulenum; if (cmd->arg1 == IP_FW_TABLEARG) dt->info = tablearg; else dt->info = cmd->arg1; m_tag_prepend(m, mtag); retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; goto done; } case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, ip_len, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(args->f_id.flags) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; goto done; case O_FORWARD_IP: { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) { if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval = IP_FW_PASS; } goto done; case O_NETGRAPH: case O_NGTEE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; goto done; case O_NAT: { struct cfg_nat *t; int nat_id; if (IPFW_NAT_LOADED) { args->rule = f; /* Report matching rule. */ t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? 
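	/*
	 * The nat instance is resolved lazily: a fixed instance number is
	 * looked up once and the pointer cached in the instruction, but
	 * with IP_FW_TABLEARG the instance comes from tablearg and must
	 * be looked up for every packet (see the check below).
	 */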
tablearg : cmd->arg1; LOOKUP_NAT(layer3_chain, nat_id, t); if (t == NULL) { retval = IP_FW_DENY; goto done; } if (cmd->arg1 != IP_FW_TABLEARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); } else retval = IP_FW_DENY; goto done; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("ipfw: ouch!, skip past end of rules, denying packet\n"); IPFW_RUNLOCK(chain); return (IP_FW_DENY); done: /* Update statistics */ f->pcnt++; f->bcnt += pktlen; f->timestamp = time_uptime; IPFW_RUNLOCK(chain); return (retval); pullup_failed: if (fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. */ static void flush_rule_ptrs(struct ip_fw_chain *chain) { struct ip_fw *rule; IPFW_WLOCK_ASSERT(chain); for (rule = chain->rules; rule; rule = rule->next) rule->next_rule = NULL; } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. */ static int add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { struct ip_fw *rule, *f, *prev; int l = RULESIZE(input_rule); if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) return (EINVAL); rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO); if (rule == NULL) return (ENOSPC); bcopy(input_rule, rule, l); rule->next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; IPFW_WLOCK(chain); if (chain->rules == NULL) { /* default rule */ chain->rules = rule; goto done; } /* * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ if (autoinc_step < 1) autoinc_step = 1; else if (autoinc_step > 1000) autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default */ for (f = chain->rules; f; f = f->next) { if (f->rulenum == IPFW_DEFAULT_RULE) break; rule->rulenum = f->rulenum; } if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) rule->rulenum += autoinc_step; input_rule->rulenum = rule->rulenum; } /* * Now insert the new rule in the right place in the sorted list. */ for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) { if (f->rulenum > rule->rulenum) { /* found the location */ if (prev) { rule->next = f; prev->next = rule; } else { /* head insert */ rule->next = chain->rules; chain->rules = rule; } break; } } flush_rule_ptrs(chain); done: static_count++; static_len += l; IPFW_WUNLOCK(chain); DEB(printf("ipfw: installed rule %d, static count now %d\n", rule->rulenum, static_count);) return (0); } /** * Remove a static rule (including derived * dynamic rules) * and place it on the ``reap list'' for later reclamation. * The caller is in charge of clearing rule pointers to avoid * dangling pointers. * @return a pointer to the next entry. * Arguments are not checked, so they better be correct. 
*/ static struct ip_fw * remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { struct ip_fw *n; int l = RULESIZE(rule); IPFW_WLOCK_ASSERT(chain); n = rule->next; IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); if (prev == NULL) chain->rules = n; else prev->next = n; static_count--; static_len -= l; rule->next = chain->reap; chain->reap = rule; return n; } /** * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. */ static void reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); } } /* * Remove all rules from a chain (except rules in set RESVD_SET * unless kill_default = 1). The caller is responsible for * reclaiming storage for the rules left in chain->reap. */ static void free_chain(struct ip_fw_chain *chain, int kill_default) { struct ip_fw *prev, *rule; IPFW_WLOCK_ASSERT(chain); flush_rule_ptrs(chain); /* more efficient to do outside the loop */ for (prev = NULL, rule = chain->rules; rule ; ) if (kill_default || rule->set != RESVD_SET) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } /** * Remove all rules with given number, and also do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an u_int32_t. The low 16 bit are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers * 5 delete rules with given number and with given set number */ static int del_entry(struct ip_fw_chain *chain, u_int32_t arg) { struct ip_fw *prev = NULL, *rule; u_int16_t rulenum; /* rule or old_set */ u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (rulenum >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > RESVD_SET) /* old_set */ return EINVAL; } IPFW_WLOCK(chain); rule = chain->rules; chain->reap = NULL; switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return EINVAL; } /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. 
*/ flush_rule_ptrs(chain); while (rule->rulenum == rulenum) rule = remove_rule(chain, rule, prev); break; case 1: /* delete all rules with given set number */ flush_rule_ptrs(chain); rule = chain->rules; while (rule->rulenum < IPFW_DEFAULT_RULE) if (rule->set == rulenum) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } break; case 2: /* move rules with given number to new set */ rule = chain->rules; for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; break; case 3: /* move rules with given set number to new set */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; break; case 4: /* swap two sets */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; break; case 5: /* delete rules with given number and with given set number. * rulenum - given rule number; * new_set - given set number. */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return (EINVAL); } flush_rule_ptrs(chain); while (rule->rulenum == rulenum) { if (rule->set == new_set) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } } /* * Look for rules to reclaim. We grab the list before * releasing the lock then reclaim them w/o the lock to * avoid a LOR with dummynet. */ rule = chain->reap; chain->reap = NULL; IPFW_WUNLOCK(chain); if (rule) reap_rules(rule); return 0; } /* * Clear counters for a specific rule. * The enclosing "table" is assumed locked. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_WLOCK(chain); if (rulenum == 0) { norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) { /* Skip rules from another set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. */ for (rule = chain->rules; rule; rule = rule->next) if (rule->rulenum == rulenum) { while (rule && rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); rule = rule->next; } cleared = 1; break; } if (!cleared) { /* we did not find any matching rules */ IPFW_WUNLOCK(chain); return (EINVAL); } msg = log_only ? 
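	/*
	 * Both format strings take the rule number as their %d argument;
	 * the message is emitted by the log() call at the end of the
	 * function when fw_verbose is set.
	 */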
"ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } IPFW_WUNLOCK(chain); if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert. * Fortunately rules are simple, so this mostly need to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else 
goto check_size; case O_NETGRAPH: case O_NGTEE: if (!NG_IPFW_LOADED) return EINVAL; else goto check_size; case O_NAT: if (!IPFW_NAT_LOADED) return EINVAL; if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; int i; time_t boot_seconds; boot_seconds = boottime.tv_sec; /* XXX this can take a long time and locking will block packet flow */ IPFW_RLOCK(chain); for (rule = chain->rules; rule ; rule = rule->next) { /* * Verify the entry fits in the buffer in case the * rules changed between calculating buffer space and * now. This would be better done using a generation * number but should suffice for now. */ i = RULESIZE(rule); if (bp + i <= ep) { bcopy(rule, bp, i); /* * XXX HACK. Store the disable mask in the "next" pointer * in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), sizeof(set_disable)); if (((struct ip_fw *)bp)->timestamp) ((struct ip_fw *)bp)->timestamp += boot_seconds; bp += i; } } IPFW_RUNLOCK(chain); if (ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets; i++) for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; bcopy(p, dst, sizeof *p); bcopy(&(p->rule->rulenum), &(dst->rule), sizeof(p->rule->rulenum)); /* * store set number into high word of * dst->rule pointer. */ bcopy(&(p->rule->set), (char *)&dst->rule + sizeof(p->rule->rulenum), sizeof(p->rule->set)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ bcopy(&dst, &dst->next, sizeof(dst)); last = dst; dst->expire = TIME_LEQ(dst->expire, time_uptime) ? 
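	/*
	 * Export expire to userland as seconds remaining rather than as
	 * an absolute uptime value; entries that have already expired
	 * are reported as 0.
	 */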
0 : dst->expire - time_uptime ; bp += sizeof(ipfw_dyn_rule); } } IPFW_DYN_UNLOCK(); if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); } return (bp - (char *)buf); } /** * {set|get}sockopt parser. */ static int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size; struct ip_fw *buf, *rule; u_int32_t rulenum[2]; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, ipfw_getrules(&layer3_chain, buf, size)); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ IPFW_WLOCK(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 0 /* keep default rule */); rule = layer3_chain.reap; layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (rule != NULL) reap_rules(rule); break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw) ); if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. 
*/ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rulenum[0]) & ~rulenum[1] & ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ else error = EINVAL; break; case IP_FW_ZERO: case IP_FW_RESETLOG: /* argument is an int, the rule number */ rulenum[0] = 0; if (sopt->sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(&layer3_chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = add_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = del_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; IPFW_WLOCK(&layer3_chain); error = flush_table(&layer3_chain, tbl); IPFW_WUNLOCK(&layer3_chain); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; IPFW_RLOCK(&layer3_chain); error = count_table(&layer3_chain, tbl, &cnt); IPFW_RUNLOCK(&layer3_chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); IPFW_RLOCK(&layer3_chain); error = dump_table(&layer3_chain, tbl); IPFW_RUNLOCK(&layer3_chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; case IP_FW_NAT_CFG: { if (IPFW_NAT_LOADED) error = ipfw_nat_cfg_ptr(sopt); else { printf("IP_FW_NAT_CFG: ipfw_nat not present, please load it.\n"); error = EINVAL; } } break; case IP_FW_NAT_DEL: { if (IPFW_NAT_LOADED) error = ipfw_nat_del_ptr(sopt); else { printf("IP_FW_NAT_DEL: ipfw_nat not present, please load it.\n"); printf("ipfw_nat not loaded: %d\n", sopt->sopt_name); error = EINVAL; } } break; case IP_FW_NAT_GET_CONFIG: { if (IPFW_NAT_LOADED) error = ipfw_nat_get_cfg_ptr(sopt); else { printf("IP_FW_NAT_GET_CFG: ipfw_nat not present, please load it.\n"); error = EINVAL; } } break; case IP_FW_NAT_GET_LOG: { if (IPFW_NAT_LOADED) error = ipfw_nat_get_log_ptr(sopt); else { printf("IP_FW_NAT_GET_LOG: ipfw_nat not present, please load it.\n"); error = EINVAL; } } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here he have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives.
It is invoked * every dyn_keepalive_period */ static void ipfw_tick(void * __unused unused) { struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. */ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ( time_uptime+dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) continue; /* too late, rule expired */ *mtailp = send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; *mtailp = send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; } } IPFW_DYN_UNLOCK(); for (m = mnext = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip_output(m, NULL, NULL, 0, NULL, NULL); } done: callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); } int ipfw_init(void) { struct ip_fw default_rule; int error; #ifdef INET6 /* Setup IPv6 fw sysctl tree. */ sysctl_ctx_init(&ip6_fw_sysctl_ctx); ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, &fw_deny_unknown_exthdrs, 0, "Deny packets with unknown IPv6 Extension Headers"); #endif layer3_chain.rules = NULL; IPFW_LOCK_INIT(&layer3_chain); ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); callout_init(&ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = RESVD_SET; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; error = add_rule(&layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_default_rule = layer3_chain.rules; printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_rule.cmd[0].opcode == O_ACCEPT ? 
"accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); error = init_tables(&layer3_chain); if (error) { IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); LIST_INIT(&layer3_chain.nat); return (0); } void ipfw_destroy(void) { struct ip_fw *reap; ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; callout_drain(&ipfw_timeout); IPFW_WLOCK(&layer3_chain); flush_tables(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); reap = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (reap != NULL) reap_rules(reap); IPFW_DYN_LOCK_DESTROY(); uma_zdestroy(ipfw_dyn_rule_zone); if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); IPFW_LOCK_DESTROY(&layer3_chain); #ifdef INET6 /* Free IPv6 fw sysctl tree. */ sysctl_ctx_free(&ip6_fw_sysctl_ctx); #endif printf("IP firewall unloaded\n"); } Index: head/sys/netinet/ip_options.c =================================================================== --- head/sys/netinet/ip_options.c (revision 178284) +++ head/sys/netinet/ip_options.c (revision 178285) @@ -1,684 +1,684 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ip_dosourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_RW, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW, &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; n_time ntime; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; /* Ignore or reject packets with IP options. */ if (ip_doopts == 0) return 0; else if (ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; } dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; ia = (struct in_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&ipaddr); if (ia == NULL) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. 
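The option-parsing loop in ip_dooptions() above advances one option at a time: EOL terminates the list, NOP is a single byte, and every other option carries its length in cp[IPOPT_OLEN], which must stay within the remaining byte count. A small userland sketch of the same walk, illustrative only; the sample buffer holds a NOP, a one-slot record-route option and an EOL:

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdio.h>

static int
walk_options(const u_char *cp, int cnt)
{
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < IPOPT_OLEN + 1)
				return (-1);	/* truncated option */
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + 1 || optlen > cnt)
				return (-1);	/* bogus length */
		}
		printf("option %d, length %d\n", opt, optlen);
	}
	return (0);
}

int
main(void)
{
	u_char opts[] = { IPOPT_NOP, IPOPT_RR, 7, IPOPT_MINOFF,
	    0, 0, 0, 0, IPOPT_EOL };

	return (walk_options(opts, sizeof(opts)) ? 1 : 0);
}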
*/ if (!ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (ipstealth) goto dropit; #endif if (!ip_dosourceroute) { if (ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate * ICMP */ nosourcerouting: strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_WARNING, "attempted source route from %s to %s\n", inet_ntoa(ip->ip_src), buf); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif ipstat.ips_cantforward++; m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL) ia = (INA)ifa_ifwithnet((SA)&ipaddr); } else ia = ip_rtaddr(ipaddr.sin_addr); if (ia == NULL) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } ip->ip_dst = ipaddr.sin_addr; (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). 
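For reference, the data the [LS]SRR branch above manipulates is laid out as type, length and pointer bytes followed by the address list; ip_dooptions() copies the address under the pointer into ip_dst, writes the outgoing interface address back into that slot, and advances cp[IPOPT_OFFSET] by four. A sketch of such an option with example (documentation-range) addresses, not taken from the patch:

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>

/*
 * A loose source route with two hops left: 3 header bytes plus 2 * 4
 * address bytes, so the length byte is 11 and the pointer starts at
 * IPOPT_MINOFF (4).
 */
u_char lsrr_example[11] = {
	IPOPT_LSRR, 11, IPOPT_MINOFF,
	192, 0, 2, 1,		/* next hop, example address 192.0.2.1 */
	198, 51, 100, 7,	/* final hop, example address 198.51.100.7 */
};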
*/ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL && (ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr((SA)&ipaddr) == NULL) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(n_time)); cp[IPOPT_OFFSET] += sizeof(n_time); } } if (forward && ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); ipstat.ips_badoptions++; return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. */ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. 
*/ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. Second * argument is buffer to which options will be moved, and return value is * their length. * * XXX should be deleted; last arg currently ignored. */ void ip_stripoptions(struct mbuf *m, struct mbuf *mopt) { int i; struct ip *ip = mtod(m, struct ip *); caddr_t opts; int olen; olen = (ip->ip_hl << 2) - sizeof (struct ip); opts = (caddr_t)(ip + 1); i = m->m_len - (sizeof (struct ip) + olen); bcopy(opts + olen, opts, (unsigned)i); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ip->ip_len > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { MGETHDR(n, M_DONTWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } M_MOVE_PKTHDR(n, m); n->m_pkthdr.rcvif = NULL; #ifdef MAC mac_mbuf_copy(m, n); #endif n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len += optlen; return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. 
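ip_optcopy(), whose loop continues just below, keeps an option in the fragments only when its "copied" flag is set; that flag is simply the high bit of the option type byte, exposed by <netinet/ip.h> as IPOPT_COPIED(). A small illustration, not part of the change:

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdio.h>

int
main(void)
{
	/* IPOPT_COPIED(o) tests the 0x80 "copy on fragmentation" bit. */
	printf("LSRR copied into fragments: %s\n",
	    IPOPT_COPIED(IPOPT_LSRR) ? "yes" : "no");	/* yes (0x83) */
	printf("SSRR copied into fragments: %s\n",
	    IPOPT_COPIED(IPOPT_SSRR) ? "yes" : "no");	/* yes (0x89) */
	printf("RR copied into fragments: %s\n",
	    IPOPT_COPIED(IPOPT_RR) ? "yes" : "no");	/* no (0x07) */
	printf("TS copied into fragments: %s\n",
	    IPOPT_COPIED(IPOPT_TS) ? "yes" : "no");	/* no (0x44) */
	return (0);
}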
*/ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = 0; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 178284) +++ head/sys/netinet/ip_output.c (revision 178285) @@ -1,1193 +1,1193 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . 
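The wrap-around figure quoted above is easy to verify: 100 Mbit/s of back-to-back 1500-byte packets is roughly 8,300 packets per second, so the 65,536 possible ip_id values are exhausted in about eight seconds. A one-line check, purely illustrative:

#include <stdio.h>

int
main(void)
{
	double pps = 100e6 / 8 / 1500;	/* ~8333 full-size packets per second */

	printf("16-bit ip_id wraps after ~%.1f seconds\n", 65536.0 / pps);
	return (0);
}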
*/ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) #ifdef RADIX_MPATH rtalloc_mpath(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr)); #else rtalloc_ign(ro, 0); #endif if (ro->ro_rt == NULL) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; mtu = ro->ro_rt->rt_rmx.rmx_mtu; } else { mtu = ifp->if_mtu; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) 
*/ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. 
*/ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip->ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. 
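The sw_csum computation above, csum_flags & ~if_hwassist, leaves exactly the checksum work the outgoing interface cannot offload; whatever remains of CSUM_DELAY_DATA is then done in software by in_delayed_cksum(). A tiny illustration of the mask arithmetic with stand-in flag values (the real CSUM_* bits live in <sys/mbuf.h>):

#include <stdio.h>

#define EX_CSUM_IP	0x0001	/* stand-in for the kernel's CSUM_IP  */
#define EX_CSUM_TCP	0x0002	/* stand-in for the kernel's CSUM_TCP */

int
main(void)
{
	int csum_flags = EX_CSUM_IP | EX_CSUM_TCP;	/* what the packet needs */
	int if_hwassist = EX_CSUM_IP;			/* what the NIC offloads */
	int sw_csum = csum_flags & ~if_hwassist;	/* left for software */

	printf("software must still compute %#x (the TCP checksum)\n", sw_csum);
	return (0);
}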
*/ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) ia->ia_ifa.if_opackets += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; else ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? 
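The per-fragment payload size computed at the top of ip_fragment() is the MTU minus the IP header, rounded down to a multiple of eight because fragment offsets are carried in 8-byte units. A worked example, illustrative only:

#include <stdio.h>

int
main(void)
{
	int mtu = 1500, hlen = 20;	/* Ethernet MTU, header without options */
	int len = (mtu - hlen) & ~7;	/* payload bytes per fragment: 1480 */
	int ip_len = 4020;		/* example datagram length, incl. header */
	int payload = ip_len - hlen;
	int nfrags = (payload + len - 1) / len;

	printf("%d bytes per fragment, %d fragments for a %d-byte datagram\n",
	    len, nfrags, ip_len);	/* 1480 bytes, 3 fragments */
	return (0);
}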
*/ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the appropriate next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. 
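The offset stored into each fragment header above, ((off - hlen) >> 3) + ip->ip_off, is expressed in 8-byte units as the IP header requires. Continuing the 1500-byte MTU example, a sketch of the values the loop produces:

#include <stdio.h>

int
main(void)
{
	int hlen = 20, len = 1480, ip_len = 4020;
	int off;

	/* The first fragment keeps offset 0; the loop emits the rest. */
	for (off = hlen + len; off < ip_len; off += len)
		printf("payload byte %d -> ip_off field %d\n",
		    off - hlen, (off - hlen) >> 3);	/* 1480 -> 185, 2960 -> 370 */
	return (0);
}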
*/ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } - INP_LOCK(inp); + INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ - INP_LOCK(inp); \ + INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ - INP_UNLOCK(inp); \ + INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; } break; #undef OPTSET /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; - INP_LOCK(inp); + INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec4_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? 
sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. 
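The OPTSET/OPTBIT pairs in ip_ctloutput() above implement simple per-socket boolean flags; from userland they are ordinary integer socket options at the IPPROTO_IP level. A short sketch, illustrative only:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1, val = 0;
	socklen_t len = sizeof(val);

	if (s < 0)
		return (1);
	/* Ask for the received TTL as control data (sets INP_RECVTTL). */
	if (setsockopt(s, IPPROTO_IP, IP_RECVTTL, &on, sizeof(on)) < 0)
		perror("IP_RECVTTL");
	/* Read the flag back; the kernel reports it via OPTBIT(). */
	if (getsockopt(s, IPPROTO_IP, IP_RECVTTL, &val, &len) == 0)
		printf("IP_RECVTTL is %s\n", val ? "on" : "off");
	return (0);
}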
For the moment, we'll err on the side * of safety by using if_simloop(). */ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else if_simloop(ifp, copym, dst->sin_family, 0); #endif } } Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 178284) +++ head/sys/netinet/raw_ip.c (revision 178285) @@ -1,918 +1,918 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; /* * hooks for multicast routing. They all default to NULL, * so leave them not initialized and rely on BSS being set to 0. */ /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter; /* The various mrouter and rsvp functions */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(int, caddr_t); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); void (*rsvp_input_p)(struct mbuf *m, int off); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. 
*/ static void rip_zone_change(void *tag) { uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); LIST_INIT(&ripcb); ripcbinfo.ipi_listhead = &ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ ripcbinfo.ipi_hashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_hashmask); ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; static int raw_append(struct inpcb *last, struct ip *ip, struct mbuf *n) { int policyfail = 0; - INP_LOCK_ASSERT(last); + INP_WLOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return policyfail; } /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ void rip_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; INP_INFO_RLOCK(&ripcbinfo); ripsrc.sin_addr = ip->ip_src; last = NULL; LIST_FOREACH(inp, &ripcb, inp_list) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_ip_p && inp->inp_ip_p != proto) { docontinue: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) goto docontinue; if (jailed(inp->inp_socket->so_cred)) if (htonl(prison_getip(inp->inp_socket->so_cred)) != ip->ip_dst.s_addr) goto docontinue; if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) raw_append(last, ip, n); /* XXX count dropped packet */ - INP_UNLOCK(last); + INP_WUNLOCK(last); } last = inp; } if (last != NULL) { if (raw_append(last, ip, m) != 0) ipstat.ips_delivered--; - INP_UNLOCK(last); + INP_WUNLOCK(last); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); int flags = ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) return(ENOBUFS); - INP_LOCK(inp); + INP_WLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = IP_DF; else ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; if (jailed(inp->inp_socket->so_cred)) ip->ip_src.s_addr = htonl(prison_getip(inp->inp_socket->so_cred)); else ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } - INP_LOCK(inp); + INP_WLOCK(inp); ip = mtod(m, struct ip *); if (jailed(inp->inp_socket->so_cred)) { if (ip->ip_src.s_addr != htonl(prison_getip(inp->inp_socket->so_cred))) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); m_freem(m); return (EPERM); } } /* don't allow both user specified and setsockopt options, and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (ip->ip_hl << 2))) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); m_freem(m); return EINVAL; } if (ip->ip_id == 0) ip->ip_id = ip_newid(); /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return error; } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: /* * XXXRW: Isn't this checked one layer down? Yes, it * is. 
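The INP_WLOCK(), INP_WUNLOCK() and INP_WLOCK_ASSERT() calls used throughout rip_output() and the other raw IP entry points take the per-inpcb lock exclusively, in write mode. Their definitions are not part of this file; a minimal sketch of what such wrappers typically expand to, assuming an rwlock(9)-backed inp_lock member in struct inpcb (the real definitions in netinet/in_pcb.h may differ in detail):

	#define	INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
	#define	INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
	#define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)

Matching read-side macros (e.g. an INP_RLOCK()) would be expected alongside these, but the raw IP paths in this file take the lock for writing throughout.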
*/ error = priv_check(curthread, PRIV_NETINET_IPFW); if (error != 0) return (error); if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: error = priv_check(curthread, PRIV_NETINET_DUMMYNET); if (error != 0) return (error); if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: /* * XXXRW: Isn't this checked one layer down? */ error = priv_check(curthread, PRIV_NETINET_IPFW); if (error != 0) return (error); if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: error = priv_check(curthread, PRIV_NETINET_DUMMYNET); if (error != 0) return (error); if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, * and calls in_ifadown() to remove all routes corresponding to that address. * It also receives the PRC_IFUP messages from if_up() and reinstalls the * interface routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. 
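The IP_HDRINCL option handled in rip_ctloutput() above selects between the two rip_output() paths: with INP_HDRINCL clear the kernel prepends the IP header and fills it from the pcb defaults, and with it set the caller supplies a complete header that the kernel still sanity-checks for consistent lengths and whose ip_id it fills when zero. A minimal userland sketch of both paths, for illustration only (the protocol number 253, the buffer names and dst are placeholders, error handling is omitted, and creating the socket requires the PRIV_NETINET_RAW privilege checked in rip_attach()):

	#include <sys/socket.h>
	#include <netinet/in.h>

	int on = 1;
	int s = socket(AF_INET, SOCK_RAW, 253);	/* arbitrary protocol number */

	/* Kernel-built IP header (INP_HDRINCL clear). */
	sendto(s, payload, payload_len, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* Caller-built IP header (INP_HDRINCL set). */
	setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on));
	sendto(s, packet_with_ip_header, packet_len, 0,
	    (struct sockaddr *)&dst, sizeof(dst));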
*/ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = 9216; u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return error; if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; INP_INFO_WLOCK(&ripcbinfo); error = in_pcballoc(so, &ripcbinfo); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = ip_defttl; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (so == ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); inp->inp_faddr.s_addr = INADDR_ANY; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (jailed(td->td_ucred)) { if (addr->sin_addr.s_addr == INADDR_ANY) addr->sin_addr.s_addr = htonl(prison_getip(td->td_ucred)); if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) return (EADDRNOTAVAIL); } if (TAILQ_EMPTY(&ifnet) || 
(addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return EADDRNOTAVAIL; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); inp->inp_laddr = addr->sin_addr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return EAFNOSUPPORT; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); inp->inp_faddr = addr->sin_addr; soisconnected(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); socantsendmore(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return EISCONN; } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return ENOTCONN; } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return rip_output(m, so, dst); } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&ripcbinfo); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; - + INP_INFO_RLOCK(&ripcbinfo); for (inp = LIST_FIRST(ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) { /* XXX held references? 
*/ inp_list[i++] = inp; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&ripcbinfo); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 178284) +++ head/sys/netinet/tcp_input.c (revision 178285) @@ -1,3033 +1,3033 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
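The net.inet.raw.pcblist sysctl registered above exports a struct xinpgen header, one struct xinpcb per raw pcb the caller is permitted to see, and a trailing struct xinpgen with updated generation counts so the caller can detect concurrent changes and retry. A minimal consumer sketch in the usual netstat style (not code from this tree; headers, privilege handling and structure layouts may differ in practice):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/socketvar.h>
	#include <sys/sysctl.h>
	#include <netinet/in.h>
	#include <netinet/in_pcb.h>
	#include <stdlib.h>

	void
	walk_raw_pcblist(void)
	{
		size_t len = 0;
		char *buf;
		struct xinpgen *xig;

		if (sysctlbyname("net.inet.raw.pcblist", NULL, &len, NULL, 0) != 0)
			return;
		if ((buf = malloc(len)) == NULL)
			return;
		if (sysctlbyname("net.inet.raw.pcblist", buf, &len, NULL, 0) != 0) {
			free(buf);
			return;
		}
		xig = (struct xinpgen *)buf;
		for (xig = (struct xinpgen *)((char *)xig + xig->xig_len);
		    xig->xig_len > sizeof(struct xinpgen);
		    xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
			struct xinpcb *xip = (struct xinpcb *)xig;
			/* xip->xi_inp is the copied inpcb, xip->xi_socket the socket. */
			(void)xip;
		}
		free(buf);
	}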
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include static const int tcprexmtthresh = 3; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST on segments to closed ports"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); static int tcp_do_rfc3042 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); static int tcp_insecure_rst = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, &tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); int tcp_do_autorcvbuf = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); int tcp_autorcvbuf_inc = 16*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); int tcp_autorcvbuf_max = 256*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. 
We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(struct mbuf *m, int off0) { struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; const int isipv6 = 0; #endif struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif to.to_flags = 0; tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: Avoid compiler warning. */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. 
*/ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); findpcb: INP_INFO_WLOCK_ASSERT(&tcbinfo); #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } else #endif /* IPFIREWALL_FORWARD */ { if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. 
*/ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((blackhole == 1 && (thflags & TH_SYN)) || blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_LOCK(inp); + INP_WLOCK(inp); #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { ipsec6stat.in_polvio++; goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { ipsec4stat.in_polvio++; goto dropunlock; } #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch * stray or duplicate segments arriving late. If this segment * was a legitimate new connection attempt the old INPCB gets * removed and we can try again to find a listening socket. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; INP_INFO_WUNLOCK(&tcbinfo); return; } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef MAC - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) { #ifdef INET6 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); #endif } else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); bzero(&inc, sizeof(inc)); inc.inc_isipv6 = isipv6; #ifdef INET6 if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ if (!syncache_expand(&inc, &to, th, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. 
*/ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, (tcp_sc_rst_sock_fail ? "sending RST" : "try again")); if (tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ - INP_UNLOCK(inp); /* listen socket */ + INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); - INP_LOCK(inp); /* new connection */ + INP_WLOCK(inp); /* new connection */ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); INP_INFO_UNLOCK_ASSERT(&tcbinfo); return; } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); tcpstat.tcps_badsyn++; goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); tcpstat.tcps_badsyn++; goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. 
* * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } if (isipv6) { #ifdef INET6 if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } #endif } else { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); syncache_add(&inc, &to, th, inp, &so, m); /* * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). */ INP_INFO_UNLOCK_ASSERT(&tcbinfo); return; } /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. 
tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); INP_INFO_UNLOCK_ASSERT(&tcbinfo); return; dropwithreset: INP_INFO_WLOCK_ASSERT(&tcbinfo); tcp_dropwithreset(m, th, tp, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ dropunlock: INP_INFO_WLOCK_ASSERT(&tcbinfo); if (inp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); drop: INP_INFO_UNLOCK_ASSERT(&tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return; } static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen) { int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; int rstreason, todrop, win; u_long tiwin; struct tcpopt to; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, ticks)) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. 
If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT) && tp->t_dupacks < tcprexmtthresh) || ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * This is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. 
*/ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * This is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 2. received bytes per RTT is within seven eighth of the * current socket buffer size; * 3. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (to.to_tsecr > tp->rfbuf_ts && to.to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + tcp_autorcvbuf_inc, tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. 
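As a concrete illustration of the step-up rule described above, assume the default knobs shown earlier (recvbuf_inc = 16 KB, recvbuf_max = 256 KB) and a receive buffer currently sized at 64 KB: the buffer grows to min(64 KB + 16 KB, 256 KB) = 80 KB only if more than 7/8 of 64 KB, i.e. 56 KB, arrived within one reflected-timestamp round trip, and it stops growing once sb_hiwat reaches the 256 KB cap. Outside bulk-receive mode the rfbuf_ts/rfbuf_cnt counters are simply reset, so such connections never step up.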
*/ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, curthread)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) tp = tcp_drop(tp, ECONNREFUSED); if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_state = TCPS_SYN_RECEIVED; } KASSERT(headlocked, ("%s: trimthenstep6: head not locked", __func__)); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. 
* If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 
2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (tcp_insecure_rst == 0 && !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { tcpstat.tcps_badrst++; goto drop; } /* FALLTHROUGH */ case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close.2: head not locked", __func__)); tp = tcp_close(tp); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. 
*/ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { char *s; KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " "not locked", __func__)); if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " "was closed, sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " "head not locked", __func__)); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp))) { if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. 
*/ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp)) { tp->t_dupacks = 0; break; } } else if (tcp_do_newreno) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: KASSERT(headlocked, ("%s: process_ACK: head not locked", __func__)); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. 
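/*
 * Editor's illustrative sketch (not part of this commit): on the third
 * duplicate ACK the code above halves the effective window to obtain the
 * new slow-start threshold, expressed in whole segments and never less
 * than two segments.  The standalone helper and its name
 * (fastrexmit_ssthresh) are invented for illustration.
 */
static unsigned long
fastrexmit_ssthresh(unsigned long snd_wnd, unsigned long snd_cwnd,
    unsigned int maxseg)
{
        unsigned long win;

        win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / maxseg;
        if (win < 2)
                win = 2;
        return (win * maxseg);
}

/*
 * Example: snd_wnd = 64000, snd_cwnd = 20000, maxseg = 1460
 *   -> min = 20000, 20000 / 2 / 1460 = 6 segments, ssthresh = 8760 bytes.
 * After entering recovery, snd_cwnd is then reset to a single segment
 * (NewReno) or rebuilt from the SACK scoreboard, as in the code above.
 */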
Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. 
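/*
 * Editor's illustrative sketch (not part of this commit): the ACK
 * processing above grows the congestion window by one full segment per
 * ACK while below ssthresh (slow start) and by roughly maxseg^2/cwnd per
 * ACK above it (congestion avoidance, about one segment per RTT).  The
 * helper name (cwnd_after_ack) and the flat 65535-byte clamp standing in
 * for TCP_MAXWIN << snd_scale are assumptions for illustration.
 */
static unsigned int
cwnd_after_ack(unsigned int cwnd, unsigned int ssthresh, unsigned int maxseg)
{
        unsigned int incr = maxseg;             /* exponential growth */

        if (cwnd > ssthresh)
                incr = maxseg * maxseg / cwnd;  /* ~1 segment per window */
        cwnd += incr;
        return (cwnd > 65535U ? 65535U : cwnd);
}

/*
 * Example with maxseg = 1460: at cwnd = 4380 (below ssthresh) the window
 * grows by 1460 per ACK; at cwnd = 29200 (above ssthresh) it grows by only
 * 1460*1460/29200 = 73 bytes per ACK, i.e. about one segment per RTT.
 */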
* Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { int timeout; soisdisconnected(so); timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : tcp_maxidle; tcp_timer_activate(tp, TT_2MSL, timeout); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); goto drop; } break; } } step6: KASSERT(headlocked, ("%s: step6: head not locked", __func__)); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
*/ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. 
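/*
 * Editor's illustrative sketch (not part of this commit): the reassembly
 * fast path above appends an in-order segment on an empty queue straight
 * to the socket buffer and only schedules a delayed ACK; anything out of
 * order goes through tcp_reass() and forces an immediate ACK so the
 * peer's fast retransmit can trigger.  The enum and helper name
 * (ack_policy, ack_policy_for_segment) are invented for illustration.
 */
enum ack_policy { ACK_DELAYED, ACK_NOW };

static enum ack_policy
ack_policy_for_segment(unsigned int seg_seq, unsigned int rcv_nxt,
    int reass_queue_empty, int conn_established, int delack_enabled)
{
        if (seg_seq == rcv_nxt && reass_queue_empty && conn_established &&
            delack_enabled)
                return (ACK_DELAYED);   /* in order: piggyback the ACK later */
        return (ACK_NOW);               /* out of order or no delack: ACK now */
}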
*/ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("%s: dodata: " "TCP_FIN_WAIT_2: head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); return; } } INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(headlocked == 0, ("%s: check_delack: head locked", __func__)); INP_INFO_UNLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); return; dropafterack: KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); INP_INFO_WUNLOCK(&tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); tcp_dropwithreset(m, th, tp, tlen, rstreason); if (tp != NULL) - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. 
*/ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } else #endif { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); return; } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; tcpstat.tcps_sack_rcv_blocks++; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
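/*
 * Editor's illustrative sketch (not part of this commit): tcp_dooptions()
 * above walks the raw option bytes using the standard kind/length layout:
 * EOL ends the list, NOP is a single byte, and every other option carries
 * a length byte that must be at least 2 and must not run past the option
 * space.  The walker below just prints kind/length pairs; its name and
 * the use of stdio are illustration-only assumptions.
 */
#include <stdio.h>

static void
walk_tcp_options(const unsigned char *cp, int cnt)
{
        int opt, optlen;

        for (; cnt > 0; cnt -= optlen, cp += optlen) {
                opt = cp[0];
                if (opt == 0)                   /* TCPOPT_EOL */
                        break;
                if (opt == 1)                   /* TCPOPT_NOP */
                        optlen = 1;
                else {
                        if (cnt < 2)
                                break;
                        optlen = cp[1];
                        if (optlen < 2 || optlen > cnt)
                                break;          /* malformed: stop parsing */
                        printf("option kind %d, length %d\n", opt, optlen);
                }
        }
}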
*/ static void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. 
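/*
 * Editor's illustrative sketch (not part of this commit): tcp_xmit_timer()
 * above keeps srtt and rttvar in fixed point and applies the classic
 * smoothing srtt = 7/8*srtt + 1/8*rtt and rttvar = 3/4*rttvar + 1/4*|delta|,
 * then sets the retransmit timeout near srtt + 4*rttvar.  The sketch below
 * uses simpler scaling than the kernel's TCP_RTT_SHIFT/TCP_RTTVAR_SHIFT
 * constants; the struct and function names are invented.
 */
struct rtt_state {
        int srtt8;      /* smoothed RTT, scaled by 8 */
        int rttvar4;    /* smoothed mean deviation, scaled by 4 */
};

static int
rtt_sample(struct rtt_state *rs, int rtt /* measured RTT in ticks */)
{
        int delta, rto;

        if (rs->srtt8 == 0) {
                /* First measurement: srtt = rtt, rttvar = rtt/2. */
                rs->srtt8 = rtt << 3;
                rs->rttvar4 = rtt << 1;
        } else {
                delta = rtt - (rs->srtt8 >> 3);
                rs->srtt8 += delta;                        /* srtt += delta/8 (scaled) */
                if (delta < 0)
                        delta = -delta;
                rs->rttvar4 += delta - (rs->rttvar4 >> 2); /* rttvar += (|delta|-rttvar)/4 */
        }
        /* RTO ~= srtt + 4*rttvar, never below 2 ticks. */
        rto = (rs->srtt8 >> 3) + rs->rttvar4;
        return (rto < 2 ? 2 : rto);
}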
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(struct tcpcb *tp, int offer) { int rtt, mss; u_long bufsize; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct hc_metrics_lite metrics; int origoffer = offer; int mtuflags = 0; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) return; /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxopd above. */ offer = tp->t_maxopd; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); /* * If there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates that no segments were received yet. 
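/*
 * Editor's illustrative sketch (not part of this commit): tcp_mss() above
 * derives the effective segment size from the smaller of the cached path
 * MTU and the interface MTU, minus the fixed IP+TCP header overhead, and
 * never exceeds what the peer offered in its MSS option (itself clamped
 * upward to defeat tiny-MSS DoS).  The helper name and the constants
 * MIN_PROTOH_V4 and TCP_MINMSS_EXAMPLE are illustration-only; the kernel
 * takes the real values from sizeof(struct tcpiphdr) and tcp_minmss.
 */
#define MIN_PROTOH_V4           40      /* 20-byte IP + 20-byte TCP header */
#define TCP_MINMSS_EXAMPLE      216

static int
effective_mss(unsigned long path_mtu, unsigned long link_mtu, int offer)
{
        unsigned long mtu = (path_mtu != 0 && path_mtu < link_mtu) ?
            path_mtu : link_mtu;
        int mss = (int)mtu - MIN_PROTOH_V4;

        if (offer < TCP_MINMSS_EXAMPLE)         /* anti-DoS clamp on the offer */
                offer = TCP_MINMSS_EXAMPLE;
        return (mss < offer ? mss : offer);
}

/*
 * Example: link MTU 1500, cached path MTU 1492 (PPPoE), peer offers 1460:
 *   min(1492, 1500) - 40 = 1452, min(1452, 1460) = 1452.
 */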
* In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too. */ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overloads the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. * We currently check only in syncache_socket for that. */ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; /* Check the interface for TSO capabilities. 
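/*
 * Editor's illustrative sketch (not part of this commit): absent a cached
 * cwnd, the code above picks the initial congestion window per RFC 3390 as
 * min(4*MSS, max(2*MSS, 4380 bytes)), falling back to the
 * slowstart_flightsize sysctls when RFC 3390 is disabled.  The helper name
 * is invented for illustration.
 */
static unsigned long
rfc3390_initial_cwnd(unsigned int mss)
{
        unsigned long two = 2UL * mss, four = 4UL * mss;
        unsigned long iw = two > 4380 ? two : 4380;

        return (four < iw ? four : iw);
}

/*
 * Example: mss = 1460 -> min(5840, max(2920, 4380)) = 4380 (3 segments);
 *          mss =  536 -> min(2144, max(1072, 4380)) = 2144 (4 segments).
 */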
*/ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > th->th_ack - tp->snd_una) tp->snd_cwnd -= th->th_ack - tp->snd_una; else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 178284) +++ head/sys/netinet/tcp_output.c (revision 178285) @@ -1,1412 +1,1412 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include #ifdef notyet extern struct mbuf *m_copypack(); #endif int path_mtu_discovery = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); int ss_fltsz = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, &ss_fltsz, 1, "Slow start flight size"); int ss_fltsz_local = 4; SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &ss_fltsz_local, 1, "Slow start flight size for local networks"); int tcp_do_newreno = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, 0, "Enable NewReno Algorithms"); int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); int tcp_do_autosndbuf = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); int tcp_autosndbuf_inc = 8*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); int tcp_autosndbuf_max = 256*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso = 0; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. 
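/*
 * Editor's illustrative sketch (not part of this commit): the idle check
 * at the top of tcp_output() above collapses the congestion window back to
 * a restart window when nothing has been in flight for at least one
 * retransmit timeout, because the ACK clock is gone.  The helper name and
 * parameters are invented; the kernel takes the segment count from the
 * ss_fltsz / ss_fltsz_local sysctls shown above.
 */
static unsigned long
restart_cwnd(unsigned long cwnd, unsigned int maxseg, int bytes_in_flight,
    long ticks_since_last_recv, long rto_ticks, int on_local_net)
{
        int ss = on_local_net ? 4 : 1;          /* mirrors the sysctl defaults */

        if (bytes_in_flight == 0 && ticks_since_last_recv >= rto_ticks)
                return ((unsigned long)maxseg * ss);
        return (cwnd);
}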
* * Set the slow-start flight size depending on whether * this is a local network or not. */ int ss = ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; tcpstat.tcps_sack_rexmits++; tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. 
*/ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. 
our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Truncate to the maximum segment length or enable TCP Segmentation * Offloading (if supported by hardware) and ensure that FIN is removed * if the length no longer contains the last data byte. * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * The length of TSO bursts is limited to TCP_MAXWIN. That limit and * removal of FIN (if not already catched here) are handled later after * the exact length of the TCP options are known. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if (len > tp->t_maxseg) { if ((tp->t_flags & TF_TSO) && tcp_do_tso && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL #ifdef IPSEC && ipsec_optlen == 0 #endif ) { tso = 1; } else { len = tp->t_maxseg; sendalot = 1; tso = 0; } } if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. 
*/ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. * Skip this if the connection is in T/TCP half-open state. * Don't send pure window updates when the peer has closed * the connection and won't ever send more data. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ if ((tp->t_flags & TF_NOOPT) == 0) { to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = ticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = ticks; } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ #ifdef INET6 if (!isipv6 && (tp->t_flags & TF_SIGNATURE)) #else if (tp->t_flags & TF_SIGNATURE) #endif /* INET6 */ to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. * * When doing TSO limit a burst to TCP_MAXWIN minus the * IP, TCP and Options length to keep ip->ip_len from * overflowing. Prevent the last segment from being * fractional thus making them all equal sized and set * the flag to continue sending. TSO is disabled when * IP options or IPSEC are present. */ if (len + optlen + ipoptlen > tp->t_maxopd) { flags &= ~TH_FIN; if (tso) { if (len > TCP_MAXWIN - hdrlen - optlen) { len = TCP_MAXWIN - hdrlen - optlen; len = len - (len % (tp->t_maxopd - optlen)); sendalot = 1; } else if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } } /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, (int)len, max_linkhdr + hdrlen)) == 0) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } /* * m_copypack left space for our hdr; use it. 
*/ m->m_len += hdrlen; m->m_data -= hdrlen; #else MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } } #endif m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(mb, moff, (int)len); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; goto out; } } #endif /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. 
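 *
 * Worked example (illustrative numbers only): with sb_hiwat = 65536 and
 * t_maxseg = 1460, a computed recwin of 1000 is below both sb_hiwat / 4
 * and t_maxseg, so it is first zeroed to avoid offering a silly window,
 * then raised back to the already advertised rcv_adv - rcv_nxt so the
 * window never shrinks, and finally capped at TCP_MAXWIN << rcv_scale.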
*/ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data then can be buffered prior to transmitting on * the connection. */ if (recwin == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE #ifdef INET6 if (!isipv6) #endif if (tp->t_flags & TF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, sizeof(struct ip), len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); else #endif /* INET6 */ { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { m->m_pkthdr.csum_flags = CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. 
* Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); } else #endif /* INET6 */ { ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. */ if (path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, tp->t_inpcb->inp_options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. 
*/ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (!tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * * tcp_mtudisc() will find out the new MTU and as * its last action, initiate retransmission, so it * is important to not do so here. * * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * Disable it for this connection as too and * immediatly retry with MSS sized segments generated * by this function. */ if (tso) tp->t_flags &= ~TF_TSO; tcp_mtudisc(tp->t_inpcb, 0); return (0); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). 
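 */

/*
 * The helper below is a hypothetical, self-contained sketch of the space
 * accounting described above; its name and standalone form are assumptions
 * for illustration and are not part of this change.
 */
static int
example_sack_blocks_that_fit(int using_timestamps, int using_signature)
{
	int space = 40;			/* TCP_MAXOLEN */

	if (using_timestamps)
		space -= 12;		/* NOP + NOP + timestamp */
	if (using_signature)
		space -= 18;		/* MD5 signature option */
	if (space < 10)			/* SACK header (2) + one block (8) */
		return (0);
	return ((space - 2) / 8);	/* 8 bytes per SACK block */
}

/*
 * (End of illustrative sketch; tcp_addoptions() follows.)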
*/ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } tcpstat.tcps_sack_send_blocks++; break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." 
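 *
 * For example (illustrative numbers): an option block of 30 bytes is not
 * a multiple of 4, so a single EOL byte was appended above (31 bytes) and
 * the loop below adds one zero pad byte to reach the 32 byte boundary.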
*/ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 178284) +++ head/sys/netinet/tcp_reass.c (revision 178285) @@ -1,285 +1,285 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); int tcp_reass_qsize = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_maxqlen = 48; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, &tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); static int tcp_reass_overflows = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { tcp_reass_maxseg = nmbclusters / 16; uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; void tcp_reass_init(void) { tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==NULL after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == NULL) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let the missing segment through which caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || tp->t_segqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } /* * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. 
If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup_locked(so); return (flags); } Index: head/sys/netinet/tcp_sack.c =================================================================== --- head/sys/netinet/tcp_sack.c (revision 178284) +++ head/sys/netinet/tcp_sack.c (revision 178285) @@ -1,681 +1,681 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 */ /*- * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #include extern struct uma_zone *sack_hole_zone; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); int tcp_do_sack = 1; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack); static int tcp_sack_maxholes = 128; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, &tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); static int tcp_sack_globalmaxholes = 65536; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, &tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); static int tcp_sack_globalholes = 0; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, &tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. */ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { /* * First reported block MUST be the most recent one. Subsequent * blocks SHOULD be in the order in which they arrived at the * receiver. These two conditions make the implementation fully * compliant with RFC 2018. */ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; int num_head, num_saved, i; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); /* SACK block for the received segment. */ head_blk.start = rcv_start; head_blk.end = rcv_end; /* * Merge updated SACK blocks into head_blk, and save unchanged SACK * blocks into saved_blks[]. num_saved will have the number of the * saved SACK blocks. */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this SACK block. */ } else if (SEQ_LEQ(head_blk.start, end) && SEQ_GEQ(head_blk.end, start)) { /* * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ if (SEQ_GT(head_blk.start, start)) head_blk.start = start; if (SEQ_LT(head_blk.end, end)) head_blk.end = end; } else { /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } /* * Update SACK list in tp->sackblks[]. */ num_head = 0; if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. */ tp->sackblks[0] = head_blk; num_head = 1; /* * If the number of saved SACK blocks exceeds its limit, * discard the last SACK block. */ if (num_saved >= MAX_SACK_BLKS) num_saved--; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved); } /* Save the number of SACK blocks. 
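 *
 * Worked example (illustrative sequence numbers): with rcv_nxt = 1000 and
 * saved blocks [2000,3000) and [5000,6000), a new out-of-order segment
 * covering [2800,3600) merges with the first block into a head block
 * [2000,3600); the untouched [5000,6000) is kept after it, so the report
 * becomes [2000,3600), [5000,6000) and rcv_numsacks ends up as 2.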
*/ tp->rcv_numsacks = num_head + num_saved; } /* * Delete all receiver-side SACK information. */ void tcp_clean_sackreport(struct tcpcb *tp) { int i; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); tp->rcv_numsacks = 0; for (i = 0; i < MAX_SACK_BLKS; i++) tp->sackblks[i].start = tp->sackblks[i].end=0; } /* * Allocate struct sackhole. */ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { struct sackhole *hole; if (tp->snd_numholes >= tcp_sack_maxholes || tcp_sack_globalholes >= tcp_sack_globalmaxholes) { tcpstat.tcps_sack_sboverflow++; return NULL; } hole = (struct sackhole *)uma_zalloc(sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; tcp_sack_globalholes++; return hole; } /* * Free struct sackhole. */ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { uma_zfree(sack_hole_zone, hole); tp->snd_numholes--; tcp_sack_globalholes--; KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); KASSERT(tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* * Insert new SACK hole into scoreboard. */ static struct sackhole * tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *after) { struct sackhole *hole; /* Allocate a new SACK hole. */ hole = tcp_sackhole_alloc(tp, start, end); if (hole == NULL) return NULL; /* Insert the new SACK hole into scoreboard. */ if (after != NULL) TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); else TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); /* Update SACK hint. */ if (tp->sackhint.nexthole == NULL) tp->sackhint.nexthole = hole; return hole; } /* * Remove SACK hole from scoreboard. */ static void tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) { /* Update SACK hint. */ if (tp->sackhint.nexthole == hole) tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); /* Remove this SACK hole. */ TAILQ_REMOVE(&tp->snd_holes, hole, scblink); /* Free this SACK hole. */ tcp_sackhole_free(tp, hole); } /* * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). */ void tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; int i, j, num_sack_blks; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; } /* * Append received valid SACK blocks to sack_blocks[], but only if we * received new blocks from the other side. */ if (to->to_flags & TOF_SACK) { for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, tp->snd_una) && SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && SEQ_LEQ(sack.end, tp->snd_max)) sack_blocks[num_sack_blks++] = sack; } } /* * Return if SND.UNA is not advanced and no valid SACK block is * received. */ if (num_sack_blks == 0) return; /* * Sort the SACK blocks so we can update the scoreboard with just one * pass. 
The overhead of sorting upto 4+1 elements is less than * making upto 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } if (TAILQ_EMPTY(&tp->snd_holes)) /* * Empty scoreboard. Need to initialize snd_fack (it may be * uninitialized or have a bogus value). Scoreboard holes * (from the sack blocks received) are created later below * (in the logic that adds holes to the tail of the * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and * SACK holes (snd_holes) are traversed from their tails with just * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; } else { /* * We failed to add a new hole based on the current * sack block. Skip over all the sack blocks that * fall completely to the right of snd_fack and * proceed to trim the scoreboard based on the * remaining sack blocks. This also trims the * scoreboard for th_ack (which is sack_blocks[0]). */ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) /* fack is advanced. */ tp->snd_fack = sblkp->end; /* We must have at least one SACK hole in scoreboard. */ KASSERT(!TAILQ_EMPTY(&tp->snd_holes), ("SACK scoreboard must not be empty")); cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole. */ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. 
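 *
 * Illustrative example (hypothetical sequence numbers): a hole covering
 * [1000,3000) matched against a SACK block [500,2000) keeps only
 * [2000,3000), and rxmit is pulled forward to at least the new start.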
*/ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } } /* * Free all SACK holes to clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *q; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) tcp_sackhole_remove(tp, q); tp->sackhint.sack_bytes_rexmit = 0; KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); KASSERT(tp->sackhint.nexthole == NULL, ("tp->sackhint.nexthole == NULL")); } /* * Partial ack handling within a sack recovery episode. Keeping this very * simple for now. When a partial ack is received, force snd_cwnd to a value * that will allow the sender to transmit no more than 2 segments. If * necessary, a better scheme can be adopted at a later point, but for now, * the goal is to prevent the sender from bursting a large amount of data in * the midst of sack recovery. */ void tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } #if 0 /* * Debug version of tcp_sack_output() that walks the scoreboard. Used for * now to sanity check the hint. */ static struct sackhole * tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *p; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = 0; TAILQ_FOREACH(p, &tp->snd_holes, scblink) { if (SEQ_LT(p->rxmit, p->end)) { if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ continue; } *sack_bytes_rexmt += (p->rxmit - p->start); break; } *sack_bytes_rexmt += (p->rxmit - p->start); } return (p); } #endif /* * Returns the next hole to retransmit and the number of retransmitted bytes * from the scoreboard. We store both the next hole and the number of * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK * reception). This avoids scoreboard traversals completely. * * The loop here will traverse *at most* one link. Here's the argument. For * the loop to traverse more than 1 link before finding the next hole to * retransmit, we would need to have at least 1 node following the current * hint with (rxmit == end). 
But, for all holes following the current hint, * (start == rxmit), since we have not yet retransmitted from them. * Therefore, in order to traverse more 1 link in the loop below, we need to * have at least one node following the current hint with (start == rxmit == * end). But that can't happen, (start == end) means that all the data in * that hole has been sacked, in which case, the hole would have been removed * from the scoreboard. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *hole = NULL; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; hole = tp->sackhint.nexthole; if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) goto out; while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { if (SEQ_LT(hole->rxmit, hole->end)) { tp->sackhint.nexthole = hole; break; } } out: return (hole); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); if (cur == NULL) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) return; /* We're already beyond any SACKed blocks */ /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, p->start)) cur = p; else { tp->snd_nxt = p->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->snd_fack; return; } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 178284) +++ head/sys/netinet/tcp_subr.c (revision 178285) @@ -1,2171 +1,2171 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. 
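 */

/*
 * Before the inflight knobs below, a small self-contained illustration of
 * the tcp_minmss rationale above; the helper is hypothetical and not part
 * of this change.
 */
static unsigned int
example_segments_needed(unsigned int payload_bytes, unsigned int mss)
{
	if (mss == 0)
		return (0);
	/* 1460 bytes take 1 segment at MSS 1460, but 73 segments at MSS 20. */
	return ((payload_bytes + mss - 1) / mss);
}

/*
 * (End of illustrative sketch; the inflight sysctls follow.)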
*/ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); static int tcp_inflight_enable = 1; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_rttthresh; SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", "RTT threshold below which inflight will deactivate itself"); static int tcp_inflight_min = 6144; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; }; static uma_zone_t tcpcb_zone; MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. 
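 */

/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * TCBHASHSIZE must be a power of two so that a bucket can be selected
 * with a cheap mask instead of a division; tcp_init() below falls back
 * to 512 when the tunable is not a power of two.
 */
static unsigned int
example_hash_bucket(unsigned int hash, unsigned int hashsize)
{
	/* A power of two has exactly one bit set. */
	if (hashsize == 0 || (hashsize & (hashsize - 1)) != 0)
		hashsize = 512;		/* same safe default as tcp_init() */
	return (hash & (hashsize - 1));	/* equivalent to hash % hashsize */
}

/*
 * (End of illustrative sketch; the TCP initialization code follows.)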
*/ static void tcp_zone_change(void *tag) { uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.ipi_listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, &tcbinfo.ipi_hashmask); tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.ipi_porthashmask); tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR /* * These have to be type stable for the benefit of the timers. */ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_init(); syncache_init(); tcp_hc_init(); tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void tcp_fini(void *xtp) { callout_stop(&isn_callout); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
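 */

/*
 * Aside, an illustrative sketch with hypothetical types (not part of this
 * change) of the tcpcb_mem idiom above: control block and timers are
 * carved out of one type-stable allocation, so the embedded pointer set
 * up by tcp_newtcpcb() stays valid for as long as the zone item exists.
 */
struct example_timers { int rexmt, persist; };
struct example_cb { int state; struct example_timers *timers; };
struct example_cb_mem {
	struct example_cb	cb;
	struct example_timers	tt;
};

static struct example_cb *
example_cb_init(struct example_cb_mem *mem)
{
	/* Point the control block at the timers in the same allocation. */
	mem->cb.timers = &mem->tt;
	return (&mem->cb);
}

/*
 * (End of illustrative sketch; tcpip_fillheaders() follows.)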
*/ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else #endif { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
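 */

/*
 * Illustrative sketch (hypothetical helper, not part of this change): a
 * simplified take on the IPv4 pseudo header contribution (source address,
 * destination address, protocol and TCP length) that tcp_respond() and
 * tcp_output() obtain from in_pseudo() when seeding the TCP checksum.
 */
static unsigned short
example_pseudo_hdr_sum(unsigned int src, unsigned int dst,
    unsigned char proto, unsigned short tcp_len)
{
	unsigned int sum;

	sum = (src >> 16) + (src & 0xffff) +
	    (dst >> 16) + (dst & 0xffff) +
	    proto + tcp_len;
	while (sum > 0xffff)		/* fold carries back into 16 bits */
		sum = (sum >> 16) + (sum & 0xffff);
	return ((unsigned short)sum);
}

/*
 * (End of illustrative sketch; tcp_respond() follows.)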
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; if (path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? 
tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); else #endif /* INET6 */ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output_reset(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we * delete the PCB. 
*/ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); callout_stop(&tp->t_timers->tt_keep); callout_stop(&tp->t_timers->tt_2msl); callout_stop(&tp->t_timers->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* Notify any offload devices of listener close */ if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_close(tp); in_pcbdrop(inp); tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. 
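 *
 * The walk below is gated on the do_tcpdrain knob.  As a hedged
 * illustration (the sysctl name net.inet.tcp.do_tcpdrain is assumed here
 * for that variable), a userland program can inspect it with:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	if (sysctlbyname("net.inet.tcp.do_tcpdrain", &val, &len,
 *	    NULL, 0) == 0)
 *		printf("do_tcpdrain is %d\n", val);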
*/ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; - INP_LOCK(inpb); + INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } - INP_UNLOCK(inpb); + INP_WUNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { m = syncache_pcbcount(); n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. 
*/ } else error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) inp_list[i++] = inp; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count + pcb_count; INP_INFO_RUNLOCK(&tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, 
&addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { - INP_LOCK(inp); + INP_WLOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, (tcp_minmss) + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do right thing by itself. 
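 *
 * Worked example of the clamping above, assuming the era-typical
 * defaults tcp_minmss == 216 and tcp_mssdflt == 512 (both tunable, so
 * the numbers are only illustrative): sizeof(struct tcpiphdr) is 40, so
 * the floor is max(296, 216 + 40) == 296.  A needfrag that proposes no
 * MTU and for which ip_next_mtu() cannot find a plateau of at least 296
 * therefore falls back to 512 + 40 == 552 bytes instead of caching a
 * uselessly small value.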
*/ if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&tcbinfo); } else in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. 
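 *
 * (Worked numbers: the tick callout below fires every hz/100 ticks, i.e.
 * 100 times a second, and each firing advances the projected offset by
 * ISN_BYTES_PER_SECOND / 100 == 10485; at that rate the 32-bit sequence
 * space wraps after roughly 2^32 / 1048576 == 4096 seconds, which is the
 * "over an hour before rollover" claimed above.)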
The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; tcp_seq tcp_new_isn(struct tcpcb *tp) { u_int32_t md5_buffer[4]; tcp_seq new_isn; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { u_int32_t projected_offset; ISN_LOCK(); projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (SEQ_GT(projected_offset, isn_offset)) isn_offset = projected_offset; isn_offset_old = isn_offset; callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. 
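 *
 * A concrete illustration of the computation below (not normative, since
 * the option accounting depends on what was negotiated): a discovered MTU
 * of 1500 on an IPv4 connection gives mss = 1500 - sizeof(struct tcpiphdr)
 * == 1460, and another TCPOLEN_TSTAMP_APPA (12) bytes come off if RFC 1323
 * timestamps are in use, leaving 1448.  With a 9000-byte jumbo MTU the same
 * steps give 8948, which the MCLBYTES rounding (2048-byte clusters by
 * default) then trims to 8192 so that full clusters are used.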
*/ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp; struct socket *so = inp->inp_socket; u_int maxmtu; u_int romtu; int mss; #ifdef INET6 int isipv6; #endif /* INET6 */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ romtu = #ifdef INET6 isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) : #endif /* INET6 */ tcp_maxmtu(&inp->inp_inc, NULL); if (!maxmtu) maxmtu = romtu; else maxmtu = min(maxmtu, romtu); if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return (inp); } mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return (inp); tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp); tcp_output_send(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; rtalloc_ign(&sro, RTF_CLONING); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. 
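 *
 * A caller that also wants the capability bits passes a flags pointer and
 * tests the result.  A hedged sketch of that pattern (the real consumer is
 * the MSS code in tcp_input.c and may differ in detail):
 *
 *	int mtuflags = 0;
 *	u_long maxmtu;
 *
 *	maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags);
 *	if (mtuflags & CSUM_TSO)
 *		tp->t_flags |= TF_TSO;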
*/ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only effects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typicaly 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. 
This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rollover 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case and stabilization. * Without the slop we could be locking ourselves into a lower * bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). 
* * (4) Stability value (defaults to 20 = 2 maximal packets) can * be adjusted with a sysctl but typically only needs to be * on very slow connections. A value no smaller then 5 * should be used, but only reduce this default if you have * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * off0 offset to TCP header within the mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * This function is for IPv4 use only. Calling this function with an * IPv6 packet in the mbuf chain will yield undefined results. * * Return 0 if successful, otherwise return -1. * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); bzero(&dst, sizeof(union sockaddr_union)); dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { printf("%s: SADB lookup failed for %s\n", __func__, inet_ntoa(dst.sin.sin_addr)); return (EINVAL); } MD5Init(&ctx); ipovly = (struct ipovly *)ip; th = (struct tcphdr *)((u_char *)ip + off0); doff = off0 + sizeof(struct tcphdr) + optlen; /* * Step 1: Update MD5 hash with IP pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. 
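 *
 * Simple arithmetic example of the length being digested here: a segment
 * carrying 100 data bytes and 12 bytes of options has ippseudo_len =
 * htons(100 + sizeof(struct tcphdr) + 12) == htons(132); the IP header
 * itself is never part of the pseudo-header length.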
*/ ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. 
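 *
 * For reference, a hedged sketch of how userland reaches this handler;
 * tcpdrop(8) does essentially this, writing two sockaddrs, foreign
 * endpoint first, to the net.inet.tcp.drop node declared below (variable
 * names here are illustrative only):
 *
 *	struct sockaddr_storage addrs[2];
 *
 *	memset(addrs, 0, sizeof(addrs));
 *	(fill addrs[0] with the foreign and addrs[1] with the local
 *	 sockaddr_in or sockaddr_in6 of the connection to drop)
 *	if (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
 *	    addrs, sizeof(addrs)) == -1)
 *		warn("net.inet.tcp.drop");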
*/ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ /* Is logging enabled? */ if (tcp_log_debug == 0 && tcp_log_in_vain == 0) return (NULL); s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && inc->inc_isipv6 == 0) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 178284) +++ head/sys/netinet/tcp_syncache.c (revision 178285) @@ -1,1752 +1,1752 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static int tcp_syncookies = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, "Use TCP SYN cookies if the syncache overflows"); static int tcp_syncookiesonly = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, &tcp_syncookiesonly, 0, "Use only TCP SYN cookies"); #define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ #define SYNCOOKIE_LIFETIME 16 /* seconds */ struct syncache { TAILQ_ENTRY(syncache) sc_hash; struct in_conninfo sc_inc; /* addresses */ int sc_rxttime; /* retransmit time */ u_int16_t sc_rxmits; /* retransmit counter */ u_int32_t sc_tsreflect; /* timestamp to reflect */ u_int32_t sc_ts; /* our timestamp to send */ u_int32_t sc_tsoff; /* ts offset w/ syncookies */ u_int32_t sc_flowlabel; /* IPv6 flowlabel */ tcp_seq sc_irs; /* seq from peer */ tcp_seq sc_iss; /* our ISS */ struct mbuf *sc_ipopts; /* source route */ u_int16_t sc_peer_mss; /* peer's MSS */ u_int16_t sc_wnd; /* advertised window */ u_int8_t sc_ip_ttl; /* IPv4 TTL */ u_int8_t sc_ip_tos; /* IPv4 TOS */ u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; u_int8_t sc_flags; #define SCF_NOOPT 0x01 /* no TCP options */ #define SCF_WINSCALE 0x02 /* negotiated window scaling */ #define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ /* MSS is implicit */ #define SCF_UNREACH 0x10 /* icmp unreachable received */ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ #define SCF_SACK 0x80 /* send SACK option */ #ifndef TCP_OFFLOAD_DISABLE struct toe_usrreqs *sc_tu; /* TOE operations */ void *sc_toepcb; /* TOE protocol block */ #endif #ifdef MAC struct label *sc_label; /* MAC label reference */ #endif }; #ifdef TCP_OFFLOAD_DISABLE #define 
TOEPCB_ISSET(sc) (0) #else #define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) #endif struct syncache_head { struct mtx sch_mtx; TAILQ_HEAD(sch_head, syncache) sch_bucket; struct callout sch_timer; int sch_nextc; u_int sch_length; u_int sch_oddeven; u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; u_int sch_reseed; /* time_uptime, seconds */ }; static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static void syncookie_generate(struct syncache_head *, struct syncache *, u_int32_t *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcpopt *, struct tcphdr *, struct socket *); /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 struct tcp_syncache { struct syncache_head *hashbase; uma_zone_t zone; u_int hashsize; u_int hashmask; u_int bucket_limit; u_int cache_count; /* XXX: unprotected */ u_int cache_limit; u_int rexmt_limit; u_int hash_secret; }; static struct tcp_syncache tcp_syncache; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); int tcp_sc_rst_sock_fail = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW, &tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) 
mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(tcp_syncache.zone, sc); } void syncache_init(void) { int i; tcp_syncache.cache_count = 0; tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &tcp_syncache.bucket_limit); if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } tcp_syncache.hashmask = tcp_syncache.hashsize - 1; /* Set limits. */ tcp_syncache.cache_limit = tcp_syncache.hashsize * tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &tcp_syncache.cache_limit); /* Allocate the hash table. */ MALLOC(tcp_syncache.hashbase, struct syncache_head *, tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); /* Initialize the hash buckets. */ for (i = 0; i < tcp_syncache.hashsize; i++) { TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket); mtx_init(&tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&tcp_syncache.hashbase[i].sch_timer, &tcp_syncache.hashbase[i].sch_mtx, 0); tcp_syncache.hashbase[i].sch_length = 0; } /* Create the syncache entry zone. */ tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit); } /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); tcpstat.tcps_sc_bucketoverflow++; } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); tcp_syncache.cache_count++; tcpstat.tcps_sc_added++; } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); #endif syncache_free(sc); tcp_syncache.cache_count--; } /* * Engage/reengage time on bucket row. 
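 *
 * Worked schedule for the default limits (TCPTV_RTOBASE of 3 seconds,
 * tcp_backoff[] starting 1, 2, 4, 8): the entry is armed 3 s after
 * insertion, then 6 s, 12 s and 24 s after each successive retransmission,
 * so a never-answered SYN,ACK is abandoned roughly 3 * (1+2+4+8) == 45
 * seconds after arrival, which is where the SYNCACHE_MAXREXMTS comment
 * above gets its 45-second figure.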
*/ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; int tick = ticks; char *s; /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); tcpstat.tcps_sc_stale++; continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } (void) syncache_respond(sc); tcpstat.tcps_sc_retransmitted++; syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_isipv6) { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } else #endif { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } SCH_LOCK_ASSERT(*schp); return (NULL); /* always returns with locked sch */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. 
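syncache_chkrst() below accepts a RST only if its sequence number falls inside [irs, irs + wnd], and serial-number comparison keeps the test wrap-safe. A minimal userland sketch of that window check; the SEQ_GEQ/SEQ_LEQ definitions are reproduced here as assumptions, they are not part of this diff:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

/* Serial-number arithmetic, assumed to match the kernel's SEQ_* macros. */
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

/* Accept a RST only if its SEQ lies within [irs, irs + wnd]. */
static int
rst_in_window(tcp_seq seq, tcp_seq irs, uint16_t wnd)
{
	return (SEQ_GEQ(seq, irs) && SEQ_LEQ(seq, irs + wnd));
}

int
main(void)
{
	tcp_seq irs = 0xfffffff0u;	/* near the wrap point on purpose */

	printf("%d\n", rst_in_window(irs + 5, irs, 8192));	/* 1: valid */
	printf("%d\n", rst_in_window(irs - 5, irs, 8192));	/* 0: too old */
	printf("%d\n", rst_in_window(irs + 10000, irs, 8192));	/* 0: beyond window */
	return (0);
}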
*/ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); tcpstat.tcps_badrst++; goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); tcpstat.tcps_badrst++; goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); tcpstat.tcps_sc_reset++; } else if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != IRS %u " "(+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); tcpstat.tcps_badrst++; } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); tcpstat.tcps_sc_badack++; } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); tcpstat.tcps_sc_unreach++; done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. 
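The function below replays onto the new tcpcb the handshake state that was parked in the syncache entry: both initial sequence numbers, the fact that the peer's SYN and our SYN|ACK each consumed one sequence number, and the receive window we quoted. A toy sketch of just that seeding step, using hypothetical miniature structs rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/* Hypothetical miniatures of the syncache entry and the tcpcb fields used. */
struct mini_sc { tcp_seq iss, irs; uint16_t wnd; };
struct mini_tp { tcp_seq iss, irs, snd_nxt, snd_max, rcv_nxt, rcv_wnd, snd_wl1; };

static void
seed_from_syncache(struct mini_tp *tp, const struct mini_sc *sc)
{
	tp->iss = sc->iss;
	tp->irs = sc->irs;
	tp->rcv_nxt = sc->irs + 1;	/* the peer's SYN consumed one number */
	tp->snd_nxt = sc->iss + 1;	/* so did our SYN|ACK */
	tp->snd_max = sc->iss + 1;
	tp->snd_wl1 = sc->irs;
	tp->rcv_wnd = sc->wnd;		/* window advertised in the SYN|ACK */
}

int
main(void)
{
	struct mini_sc sc = { .iss = 1000, .irs = 5000, .wnd = 65535 };
	struct mini_tp tp;

	seed_from_syncache(&tp, &sc);
	printf("snd_nxt=%u rcv_nxt=%u rcv_wnd=%u\n",
	    tp.snd_nxt, tp.rcv_nxt, tp.rcv_wnd);
	return (0);
}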
*/ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; char *s; INP_INFO_WLOCK_ASSERT(&tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, SS_ISCONNECTED); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ tcpstat.tcps_listendrop++; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif inp = sotoinpcb(so); - INP_LOCK(inp); + INP_WLOCK(inp); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif inp->inp_lport = sc->sc_inc.inc_lport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_isipv6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if (in6_pcbconnect(inp, (struct sockaddr *)&sin6, thread0.td_ucred)) { inp->in6p_laddr = laddr6; goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; inp->in6p_flowinfo |= sc->sc_flowlabel; } else #endif { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? 
ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if (in_pcbconnect(inp, (struct sockaddr *)&sin, thread0.td_ucred)) { inp->inp_laddr = laddr; goto abort; } } tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = ticks; tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. */ if (sc->sc_rxmits) tp->snd_cwnd = tp->t_maxseg; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); tcpstat.tcps_accepts++; return (so); abort: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. 
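Whether the entry came out of the cache or was reconstructed from a syncookie, the ACK segment is then validated below: its acknowledgment must equal our ISS plus one and its sequence number must equal the peer's IRS plus one, because no payload data accompanying the SYN was ACKed. A minimal sketch of those two checks using a hypothetical standalone helper:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/*
 * Hypothetical condensation of the segment validation done by
 * syncache_expand(): the ACK must cover exactly our SYN|ACK and the
 * SEQ must sit exactly one past the peer's original SYN.
 */
static int
expand_segment_ok(tcp_seq th_ack, tcp_seq th_seq, tcp_seq iss, tcp_seq irs)
{
	if (th_ack != iss + 1)
		return (0);	/* ACK does not match ISS + 1 */
	if (th_seq != irs + 1)
		return (0);	/* SEQ does not match IRS + 1 */
	return (1);
}

int
main(void)
{
	tcp_seq iss = 4000000000u, irs = 12345;

	printf("%d\n", expand_segment_ok(iss + 1, irs + 1, iss, irs)); /* 1 */
	printf("%d\n", expand_segment_ok(iss + 2, irs + 1, iss, irs)); /* 0 */
	return (0);
}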
*/ if (!tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; tcp_syncache.cache_count--; SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must match the received initial receive sequence * number + 1 (the SYN) because we didn't ACK any data that * may have come with the SYN. */ if (th->th_seq != sc->sc_irs + 1 && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) tcpstat.tcps_sc_aborted++; else tcpstat.tcps_sc_completed++; if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ static void _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, struct toe_usrreqs *tu, void *toepcb) { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int32_t flowtmp; int win, sb_hiwat, ip_ttl, ip_tos, noopt; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); /* listen socket */ + INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. 
*/ so = *lsop; tp = sototcpcb(so); #ifdef INET6 if (inc->inc_isipv6 && (inp->in6p_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; noopt = (tp->t_flags & TF_NOOPT); so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); goto done; } else mac_syncache_create(maclabel, inp); #endif - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); /* * Remember the IP options, if any. */ #ifdef INET6 if (!inc->inc_isipv6) #endif ipopts = (m) ? ip_srcroute(m) : NULL; /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, sc->sc_toepcb); #endif tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); KASSERT(sc->sc_label != NULL, ("%s: label not initialized", __func__)); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } SCH_UNLOCK(sch); goto done; } sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ tcpstat.tcps_sc_zonefail++; if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!inc->inc_isipv6) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifndef TCP_OFFLOAD_DISABLE sc->sc_tu = tu; sc->sc_toepcb = toepcb; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. 
*/ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = ticks; sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatiblity problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, and received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. * XXX: Currently we always record the option by default and will * attempt to use it in syncache_respond(). */ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (noopt) sc->sc_flags |= SCF_NOOPT; if (tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = flowtmp; #endif } else { #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); #endif } SCH_UNLOCK(sch); /* * Do a standard 3-way handshake. */ if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { if (tcp_syncookies && tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } else { if (sc != &scs) syncache_free(sc); tcpstat.tcps_sc_dropped++; } done: #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif if (m) { *lsop = NULL; m_freem(m); } return; } static int syncache_respond(struct syncache *sc) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th; int optlen, error; u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. 
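The window-scale loop above simply picks the smallest shift that lets TCP_MAXWIN reach sb_max. A userland sketch of that selection; TCP_MAXWIN = 65535 and TCP_MAX_WINSHIFT = 14 are assumed here to match the kernel headers, and the 256 kB sb_max mirrors the default mentioned in the comment:

#include <stdio.h>

#define TCP_MAXWIN		65535	/* assumed, as in <netinet/tcp.h> */
#define TCP_MAX_WINSHIFT	14	/* assumed, as in <netinet/tcp.h> */

/* Smallest scale factor that still lets us advertise up to sb_max. */
static int
pick_wscale(unsigned long sb_max)
{
	int wscale = 0;

	while (wscale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << wscale) < sb_max)
		wscale++;
	return (wscale);
}

int
main(void)
{
	/* Default kern.ipc.maxsockbuf of 256 kB yields a scale of 3. */
	printf("sb_max=262144  -> wscale %d\n", pick_wscale(262144));
	/* A larger maxsockbuf raises the scale (and the caveat above). */
	printf("sb_max=2097152 -> wscale %d\n", pick_wscale(2097152));
	return (0);
}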
*/ m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } else #endif { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = tlen; ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= IP_DF; th = (struct tcphdr *)(ip + 1); } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. 
*/ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_compute(m, sizeof(struct ip), 0, optlen, to.to_signature, IPSEC_DIR_OUTBOUND); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len += optlen; } else optlen = 0; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen + optlen - hlen); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } return (error); } void syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m) { _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); } void syncache_offload_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct toe_usrreqs *tu, void *toepcb) { _syncache_add(inc, to, th, inp, lsop, NULL, tu, toepcb); } /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses * (where we will never receive any reply). SYN floods try to exhaust all * our memory and available slots in the SYN cache table to cause a denial * of service to legitimate users of the local host. * * The idea of SYN cookies is to encode and include all necessary information * about the connection setup state within the SYN-ACK we send back and thus * to get along without keeping any local state until the ACK to the SYN-ACK * arrives (if ever). Everything we need to know should be available from * the information we encoded in the SYN-ACK. * * More information about the theory behind SYN cookies and its first * discussion and specification can be found at: * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (gory details) * * This implementation extends the orginal idea and first implementation * of FreeBSD by using not only the initial sequence number field to store * information but also the timestamp field if present. This way we can * keep track of the entire state we need to know to recreate the session in * its original form. Almost all TCP speakers implement RFC1323 timestamps * these days. For those that do not we still have to live with the known * shortcomings of the ISN only SYN cookies. 
* * Cookie layers: * * Initial sequence number we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP * D = MD5 Digest (first dword) * M = MSS index * R = Rotation of secret * P = Odd or Even secret * * The MD5 Digest is computed with over following parameters: * a) randomly rotated secret * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) * c) the received initial sequence number from remote host * d) the rotation offset and odd/even bit * * Timestamp we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 * D = MD5 Digest (third dword) (only as filler) * S = Requested send window scale * R = Requested receive window scale * A = SACK allowed * 5 = TCP-MD5 enabled (not implemented yet) * XORed with MD5 Digest (forth dword) * * The timestamp isn't cryptographically secure and doesn't need to be. * The double use of the MD5 digest dwords ties it to a specific remote/ * local host/port, remote initial sequence number and our local time * limited secret. A received timestamp is reverted (XORed) and then * the contained MD5 dword is compared to the computed one to ensure the * timestamp belongs to the SYN-ACK we sent. The other parameters may * have been tampered with but this isn't different from supplying bogus * values in the SYN in the first place. * * Some problems with SYN cookies remain however: * Consider the problem of a recreated (and retransmitted) cookie. If the * original SYN was accepted, the connection is established. The second * SYN is inflight, and if it arrives with an ISN that falls within the * receive window, the connection is killed. * * Notes: * A heuristic to determine when to accept syn cookies is not necessary. * An ACK flood would cause the syncookie verification to be attempted, * but a SYN flood causes syncookies to be generated. Both are of equal * cost, so there's no point in trying to optimize the ACK flood case. * Also, if you don't process certain ACKs for some reason, then all someone * would have to do is launch a SYN and ACK flood at the same time, which * would stop cookie verification and defeat the entire purpose of syncookies. */ static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; static void syncookie_generate(struct syncache_head *sch, struct syncache *sc, u_int32_t *flowlabel) { MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data; u_int32_t *secbits; u_int off, pmss, mss; int i; SCH_LOCK_ASSERT(sch); /* Which of the two secrets to use. */ secbits = sch->sch_oddeven ? sch->sch_secbits_odd : sch->sch_secbits_even; /* Reseed secret if too old. */ if (sch->sch_reseed < time_uptime) { sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ secbits = sch->sch_oddeven ? sch->sch_secbits_odd : sch->sch_secbits_even; for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) secbits[i] = arc4random(); sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; } /* Secret rotation offset. */ off = sc->sc_iss & 0x7; /* iss was randomized before */ /* Maximum segment size calculation. */ pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), tcp_minmss); for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) if (tcp_sc_msstab[mss] <= pmss) break; /* Fold parameters and MD5 digest into the ISN we will send. 
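The ISN layout documented above packs, from the low bit upward, the odd/even secret selector (1 bit), the secret rotation offset (3 bits), the MSS table index (3 bits), and the top 25 bits of the MD5 digest. A small sketch of that packing and its reversal, mirroring what syncookie_generate() and syncookie_lookup() below do with the real digest; a fixed fake digest word stands in purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Pack cookie fields into an ISN: P = 1 bit, R = 3 bits, M = 3 bits, D = 25 bits. */
static uint32_t
cookie_pack(uint32_t md5_word0, unsigned oddeven, unsigned off, unsigned mss)
{
	uint32_t data;

	data = oddeven & 0x1;		/* odd or even secret, 1 bit */
	data |= (off & 0x7) << 1;	/* secret rotation offset, 3 bits */
	data |= (mss & 0x7) << 4;	/* MSS table index, 3 bits */
	data |= md5_word0 << 7;		/* top bits of the digest */
	return (data);
}

int
main(void)
{
	uint32_t fake_digest = 0x1234abcd;	/* stand-in for md5_buffer[0] */
	uint32_t isn = cookie_pack(fake_digest, 1, 5, 6);

	/* Reversal, as syncookie_lookup() does on the echoed (ACK - 1). */
	printf("oddeven=%u off=%u mss=%u digest_ok=%d\n",
	    isn & 0x1, (isn >> 1) & 0x7, (isn >> 4) & 0x7,
	    (isn & ~0x7fu) == (fake_digest << 7));
	return (0);
}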
*/ data = sch->sch_oddeven;/* odd or even secret, 1 bit */ data |= off << 1; /* secret offset, derived from iss, 3 bits */ data |= mss << 4; /* mss, 3 bits */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); MD5Update(&ctx, &data, sizeof(data)); MD5Final((u_int8_t *)&md5_buffer, &ctx); data |= (md5_buffer[0] << 7); sc->sc_iss = data; #ifdef INET6 *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; #endif /* Additional parameters are stored in the timestamp if present. */ if (sc->sc_flags & SCF_TIMESTAMP) { data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */ data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ data |= md5_buffer[2] << 10; /* more digest bits */ data ^= md5_buffer[3]; sc->sc_ts = data; sc->sc_tsoff = data - ticks; /* after XOR */ } tcpstat.tcps_sc_sendcookie++; return; } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcpopt *to, struct tcphdr *th, struct socket *so) { MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data = 0; u_int32_t *secbits; tcp_seq ack, seq; int off, mss, wnd, flags; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and * revert sequence number advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; off = (ack >> 1) & 0x7; mss = (ack >> 4) & 0x7; flags = ack & 0x7f; /* Which of the two secrets to use. */ secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; /* * The secret wasn't updated for the lifetime of a syncookie, * so this SYN-ACK/ACK is either too old (replay) or totally bogus. */ if (sch->sch_reseed < time_uptime) { return (NULL); } /* Recompute the digest so we can compare it. */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, inc, sizeof(*inc)); MD5Update(&ctx, &seq, sizeof(seq)); MD5Update(&ctx, &flags, sizeof(flags)); MD5Final((u_int8_t *)&md5_buffer, &ctx); /* Does the digest part of or ACK'ed ISS match? */ if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) return (NULL); /* Does the digest part of our reflected timestamp match? */ if (to->to_flags & TOF_TS) { data = md5_buffer[3] ^ to->to_tsecr; if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) return (NULL); } /* Fill in the syncache values. */ bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; #ifdef INET6 if (inc->inc_isipv6) { if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; } else #endif { sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; } /* Additional parameters that were encoded in the timestamp. */ if (data) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - ticks; sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; sc->sc_flags |= ((data >> 1) & 0x1) ? 
SCF_SACK : 0; sc->sc_requested_s_scale = min((data >> 2) & 0xf, TCP_MAX_WINSHIFT); sc->sc_requested_r_scale = min((data >> 6) & 0xf, TCP_MAX_WINSHIFT); if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) sc->sc_flags |= SCF_WINSCALE; } else sc->sc_flags |= SCF_NOOPT; wnd = sbspace(&so->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; sc->sc_rxmits = 0; sc->sc_peer_mss = tcp_sc_msstab[mss]; tcpstat.tcps_sc_recvcookie++; return (sc); } /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ int syncache_pcbcount(void) { struct syncache_head *sch; int count, i; for (count = 0, i = 0; i < tcp_syncache.hashsize; i++) { /* No need to lock for a read. */ sch = &tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < tcp_syncache.hashsize; i++) { sch = &tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_isipv6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c (revision 178284) +++ head/sys/netinet/tcp_timer.c (revision 178285) @@ -1,630 +1,630 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); static int always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); static int tcp_keepcnt = TCPTV_KEEPCNT; /* max idle probes */ int tcp_maxpersistidle; /* max idle time in persist */ int tcp_maxidle; /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { tcp_maxidle = tcp_keepcnt * tcp_keepintvl; INP_INFO_WLOCK(&tcbinfo); (void) tcp_tw_2msl_scan(0); INP_INFO_WUNLOCK(&tcbinfo); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ static int tcp_timer_race; SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, 0, "Count of t_inpcb races on tcp_discardcb"); /* * TCP timer processing. 
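tcp_totbackoff is kept as a precomputed constant and has to stay in sync with tcp_backoff[]; the persist-timer drop test further down multiplies it by the connection's retransmit value to bound how long a probed but silent peer is tolerated. A trivial check that the constant matches the table, with the values copied from this file and TCP_MAXRXTSHIFT assumed to be 12:

#include <stdio.h>

#define TCP_MAXRXTSHIFT	12	/* assumed; defined elsewhere */

static const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int
main(void)
{
	int i, sum = 0;

	for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
		sum += tcp_backoff[i];
	printf("sum of tcp_backoff[] = %d (tcp_totbackoff is 2559)\n", sum);
	return (0);
}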
*/ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; INP_INFO_RLOCK(&tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb * tear-down mean we need it as a work-around for races between * timers and tcp_discardcb(). * * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); */ if (inp == NULL) { tcp_timer_race++; INP_INFO_RUNLOCK(&tcbinfo); return; } - INP_LOCK(inp); + INP_WLOCK(inp); INP_INFO_RUNLOCK(&tcbinfo); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return; } callout_deactivate(&tp->t_timers->tt_delack); tp->t_flags |= TF_ACKNOW; tcpstat.tcps_delack++; (void) tcp_output(tp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif /* * XXXRW: Does this actually happen? */ INP_INFO_WLOCK(&tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb * tear-down mean we need it as a work-around for races between * timers and tcp_discardcb(). * * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); */ if (inp == NULL) { tcp_timer_race++; INP_INFO_WUNLOCK(&tcbinfo); return; } - INP_LOCK(inp); + INP_WLOCK(inp); tcp_free_sackholes(tp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); INP_INFO_WUNLOCK(&tcbinfo); return; } callout_deactivate(&tp->t_timers->tt_2msl); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long, or if 2MSL time is up from TIME_WAIT, delete connection * control block. Otherwise, check again in a bit. * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { tcpstat.tcps_finwait2_drops++; tp = tcp_close(tp); } else { if (tp->t_state != TCPS_TIME_WAIT && (ticks - tp->t_rcvtime) <= tcp_maxidle) callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl, tcp_timer_2msl, tp); else tp = tcp_close(tp); } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_WLOCK(&tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb * tear-down mean we need it as a work-around for races between * timers and tcp_discardcb(). 
* * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); */ if (inp == NULL) { tcp_timer_race++; INP_INFO_WUNLOCK(&tcbinfo); return; } - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } callout_deactivate(&tp->t_timers->tt_keep); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ tcpstat.tcps_keeptimeo++; if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ tcpstat.tcps_keepprobe++; t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); (void) m_free(dtom(t_template)); } callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); } else callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; dropit: tcpstat.tcps_keepdrops++; tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) - INP_UNLOCK(tp->t_inpcb); + INP_WUNLOCK(tp->t_inpcb); INP_INFO_WUNLOCK(&tcbinfo); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_WLOCK(&tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb * tear-down mean we need it as a work-around for races between * timers and tcp_discardcb(). * * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); */ if (inp == NULL) { tcp_timer_race++; INP_INFO_WUNLOCK(&tcbinfo); return; } - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } callout_deactivate(&tp->t_timers->tt_persist); /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ tcpstat.tcps_persisttimeo++; /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. 
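Taken together with tcp_maxidle computed in tcp_slowtimo() above, the keep-alive handler drops a connection once it has been idle for tcp_keepidle plus tcp_keepcnt probe intervals. A sketch of that budget with the customary defaults of 2 hours idle, 75-second probe interval and 8 probes; these defaults are assumptions, since the values are set elsewhere and not in this diff:

#include <stdio.h>

int
main(void)
{
	/* Assumed defaults; the real values come from TCPTV_* elsewhere. */
	int keepidle_sec = 2 * 60 * 60;	/* idle time before first probe */
	int keepintvl_sec = 75;		/* interval between probes */
	int keepcnt = 8;		/* TCPTV_KEEPCNT probes */
	int maxidle_sec = keepcnt * keepintvl_sec;

	printf("first probe after %d s idle\n", keepidle_sec);
	printf("connection dropped after %d s idle (%d + %d)\n",
	    keepidle_sec + maxidle_sec, keepidle_sec, maxidle_sec);
	return (0);
}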
*/ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { tcpstat.tcps_persistdrop++; tp = tcp_drop(tp, ETIMEDOUT); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; int rexmt; int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_WLOCK(&tcbinfo); headlocked = 1; inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb * tear-down mean we need it as a work-around for races between * timers and tcp_discardcb(). * * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); */ if (inp == NULL) { tcp_timer_race++; INP_INFO_WUNLOCK(&tcbinfo); return; } - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } callout_deactivate(&tp->t_timers->tt_rexmt); tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; tcpstat.tcps_timeoutdrop++; tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); goto out; } INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); } tcpstat.tcps_rexmttimeo++; if (tp->t_state == TCPS_SYN_SENT) rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * Disable rfc1323 if we havn't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. 
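Besides the srtt reset, the timeout handler also collapses the congestion window a little further down: cwnd falls back to a single segment and ssthresh to half of the effective window, floored at two segments. A sketch of that computation as a hypothetical standalone function, with values in bytes:

#include <stdio.h>

/*
 * Hypothetical condensation of the cwnd/ssthresh reset done by
 * tcp_timer_rexmt() on a retransmission timeout.
 */
static void
rto_collapse(unsigned snd_wnd, unsigned snd_cwnd, unsigned maxseg,
    unsigned *new_cwnd, unsigned *new_ssthresh)
{
	unsigned win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / maxseg;

	if (win < 2)
		win = 2;		/* keep at least 2 * mss of headroom */
	*new_cwnd = maxseg;		/* restart in slow start */
	*new_ssthresh = win * maxseg;	/* switch to linear growth here */
}

int
main(void)
{
	unsigned cwnd, ssthresh;

	rto_collapse(65535, 32768, 1460, &cwnd, &ssthresh);
	printf("cwnd=%u ssthresh=%u\n", cwnd, ssthresh);
	return (0);
}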
*/ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; /* * Close the congestion window down to one segment * (we'll open it by one segment for each ack we get). * Since we probably have a window's worth of unacked * data accumulated, this "slow start" keeps us from * dumping all that data as back-to-back packets (which * might overwhelm an intermediate gateway). * * There are two phases to the opening: Initially we * open by one mss on each ack. This makes the window * size increase exponentially with time. If the * window is larger than the path can handle, this * exponential growth results in dropped packet(s) * almost immediately. To get more time between * drops but still "push" the network to take advantage * of improving conditions, we switch from exponential * to linear window opening at some threshhold size. * For a threshhold, we use half the current window * size, truncated to a multiple of the mss. * * (the minimum cwnd that will give us exponential * growth is 2 mss. We don't allow the threshhold * to go below this.) */ { u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_dupacks = 0; } EXIT_FASTRECOVERY(tp); (void) tcp_output(tp); out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); } void tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) { struct callout *t_callout; void *f_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; break; default: panic("bad timer_type"); } if (delta == 0) { callout_stop(t_callout); } else { callout_reset(t_callout, delta, f_callout, tp); } } int tcp_timer_active(struct tcpcb *tp, int timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: panic("bad timer_type"); } return callout_active(t_callout); } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 178284) +++ head/sys/netinet/tcp_timewait.c (revision 178285) @@ -1,649 +1,649 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #include #include static uma_zone_t tcptw_zone; static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * tcbinfo lock, which must be held over queue iteration and modification. */ static TAILQ_HEAD(, tcptw) twq_2msl; static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *); static int tcptw_auto_size(void) { int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (ipport_lastauto > ipport_firstauto) halfrange = (ipport_lastauto - ipport_firstauto) / 2; else halfrange = (ipport_firstauto - ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. 
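The return statement that follows clamps the automatic sizing to the range [32, maxsockets / 5], so the compressed TIME_WAIT zone never claims more than half the ephemeral port range nor a fifth of the socket limit. A sketch of the same sizing rule with imax/imin spelled out; the port-range and maxsockets figures are made-up inputs:

#include <stdio.h>

static int imax_(int a, int b) { return (a > b ? a : b); }
static int imin_(int a, int b) { return (a < b ? a : b); }

/* Mirror of tcptw_auto_size(): half the ephemeral range, clamped. */
static int
tw_auto_size(int firstauto, int lastauto, int maxsockets)
{
	int halfrange;

	if (lastauto > firstauto)
		halfrange = (lastauto - firstauto) / 2;
	else
		halfrange = (firstauto - lastauto) / 2;
	return (imin_(imax_(halfrange, 32), maxsockets / 5));
}

int
main(void)
{
	/* Example inputs: ports 10000..65535, 25600 sockets (made up). */
	printf("limit = %d\n", tw_auto_size(10000, 65535, 25600));
	return (0);
}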
*/ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); static int nolocaltimewait = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, &nolocaltimewait, 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(tcptw_zone, maxtcptw); TAILQ_INIT(&twq_2msl); } /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_tw_2msl_reset(). */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (nolocaltimewait && in_localip(inp->inp_faddr)) { tp = tcp_close(tp); if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return; } tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return; } } tw->tw_inpcb = inp; /* * Recover last window size sent. */ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_cred = crhold(so->so_cred); SOCK_LOCK(so); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_vflag |= INP_TIMEWAIT; tcp_tw_2msl_reset(tw, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. 
*/ if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } #if 0 /* * The appromixate rate of ISN increase of Microsoft TCP stacks; * the actual rate is slightly higher due to the addition of * random positive increments. * * Most other new OSes use semi-randomized ISN values, so we * do not need to worry about them. */ #define MS_ISN_BYTES_PER_SECOND 250000 /* * Determine if the ISN we will generate has advanced beyond the last * sequence number used by the previous connection. If so, indicate * that it is safe to recycle this tw socket by returning 1. */ int tcp_twrecycleable(struct tcptw *tw) { tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; INP_INFO_WLOCK_ASSERT(&tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) return (1); else return (0); } #endif /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. 
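The TIME_WAIT input rules spelled out in tcp_twcheck() above can be condensed into a small decision function. The sketch below is illustrative only — it is not the kernel routine, and the flag constants and action names are simplified stand-ins.

#include <stdint.h>
#include <stdio.h>

enum tw_action { TW_DROP, TW_RESTART, TW_REARM_AND_ACK, TW_ACK, TW_NONE };

#define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)

#define FL_FIN  0x01
#define FL_SYN  0x02
#define FL_RST  0x04
#define FL_ACK  0x10

static enum tw_action
tw_classify(int flags, uint32_t seq, int tlen, uint32_t rcv_nxt,
    uint32_t snd_nxt, uint32_t ack)
{
    if (flags & FL_RST)                 /* RFC 1337: ignore RSTs */
        return (TW_DROP);
    if ((flags & FL_SYN) && SEQ_GT(seq, rcv_nxt))
        return (TW_RESTART);            /* new connection supersedes us */
    if ((flags & FL_ACK) == 0)
        return (TW_DROP);
    if ((flags & FL_FIN) &&
        seq + tlen + ((flags & FL_SYN) ? 1 : 0) + 1 == rcv_nxt)
        return (TW_REARM_AND_ACK);      /* duplicate FIN: restart 2MSL */
    if (flags != FL_ACK || tlen != 0 || seq != rcv_nxt || ack != snd_nxt)
        return (TW_ACK);                /* not a pure duplicate ACK: re-ACK */
    return (TW_NONE);
}

int
main(void)
{
    printf("%d\n", tw_classify(FL_RST, 0, 0, 0, 0, 0));
    return (0);
}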
*/ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { #ifdef INET6 struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; #endif } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (0); drop: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_tw_2msl_stop(). */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); tw->tw_inpcb = NULL; tcp_tw_2msl_stop(tw); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_vflag & INP_SOCKREF) { inp->inp_vflag &= ~INP_SOCKREF; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } } else { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) in6_pcbfree(inp); else #endif in_pcbfree(inp); } tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) return; uma_zfree(tcptw_zone, tw); } int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; struct ip *ip = NULL; u_int hdrlen, optlen; int error; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_isipv6; #endif - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. 
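A sketch of the timestamp decision just described for tcp_twrespond(): the option is emitted only when the connection negotiated timestamps (a non-zero echoed value was recorded) and the outgoing segment is a plain ACK; our clock is reported shifted by the per-connection offset, and the peer's last value is echoed back. All names below are illustrative.

#include <stdint.h>
#include <stdio.h>

#define OPT_TS      0x0001
#define FLAG_ACK    0x10

struct ts_opt {
    int      flags;
    uint32_t tsval;
    uint32_t tsecr;
};

static struct ts_opt
tw_timestamps(uint32_t now_ticks, uint32_t ts_offset, uint32_t t_recent,
    int th_flags)
{
    struct ts_opt to = { 0, 0, 0 };

    if (t_recent != 0 && th_flags == FLAG_ACK) {
        to.flags |= OPT_TS;
        to.tsval = now_ticks + ts_offset;   /* our clock, per-connection offset */
        to.tsecr = t_recent;                /* echo peer's last timestamp */
    }
    return (to);
}

int
main(void)
{
    struct ts_opt to = tw_timestamps(1000, 42, 777, FLAG_ACK);
    printf("flags=%#x tsval=%u tsecr=%u\n", to.flags, to.tsval, to.tsecr);
    return (0);
}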
*/ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = ticks + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); #ifdef INET6 if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) tcpstat.tcps_sndacks++; else tcpstat.tcps_sndctrl++; tcpstat.tcps_sndtotal++; return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tw->tw_inpcb); + INP_WLOCK_ASSERT(tw->tw_inpcb); if (rearm) TAILQ_REMOVE(&twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&twq_2msl, tw, tw_2msl); } static void tcp_tw_2msl_stop(struct tcptw *tw) { INP_INFO_WLOCK_ASSERT(&tcbinfo); TAILQ_REMOVE(&twq_2msl, tw, tw_2msl); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; INP_INFO_WLOCK_ASSERT(&tcbinfo); for (;;) { tw = TAILQ_FIRST(&twq_2msl); if (tw == NULL || (!reuse && tw->tw_time > ticks)) break; - INP_LOCK(tw->tw_inpcb); + INP_WLOCK(tw->tw_inpcb); tcp_twclose(tw, reuse); if (reuse) return (tw); } return (NULL); } Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c (revision 178284) +++ head/sys/netinet/tcp_usrreq.c (revision 178285) @@ -1,1896 +1,1896 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif #include /* * TCP protocol interface to socket abstraction. */ static int tcp_attach(struct socket *); static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); #ifdef TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() #define TCPDEBUG2(req) #endif /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. */ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); TCPDEBUG1(); error = tcp_attach(so); if (error) goto out; if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; inp = sotoinpcb(so); tp = intotcpcb(inp); out: TCPDEBUG2(PRU_ATTACH); return error; } /* * tcp_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. * * This function can probably be re-absorbed back into tcp_usr_detach() now * that there is a single detach path. */ static void tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; #ifdef INET6 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; #endif INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); tp = intotcpcb(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * There are two cases to handle: one in which the time wait * state is being discarded (INP_DROPPED), and one in which * this connection will remain in timewait. In the former, * it is time to discard all state (except tcptw, which has * already been discarded by the timewait close code, which * should be further up the call stack somewhere). 
In the * latter case, we detach from the socket, but leave the pcb * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? */ if (inp->inp_vflag & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " "INP_DROPPED && tp != NULL")); #ifdef INET6 if (isipv6) { in6_pcbdetach(inp); in6_pcbfree(inp); } else { #endif in_pcbdetach(inp); in_pcbfree(inp); #ifdef INET6 } #endif } else { #ifdef INET6 if (isipv6) in6_pcbdetach(inp); else #endif in_pcbdetach(inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } } else { /* * If the connection is not in timewait, we consider two * two conditions: one in which no further processing is * necessary (dropped || embryonic), and one in which TCP is * not yet done, but no longer requires the socket, so the * pcb will persist for the time being. * * XXXRW: Does the second case still occur? */ if (inp->inp_vflag & INP_DROPPED || tp->t_state < TCPS_SYN_SENT) { tcp_discardcb(tp); #ifdef INET6 if (isipv6) { in6_pcbdetach(inp); in6_pcbfree(inp); } else { #endif in_pcbdetach(inp); in_pcbfree(inp); #ifdef INET6 } #endif } else { #ifdef INET6 if (isipv6) in6_pcbdetach(inp); else #endif in_pcbdetach(inp); } } } /* * pru_detach() detaches the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a pru_disconnect(), * which may finish later; embryonic TCB's can just * be discarded here. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); INP_INFO_WUNLOCK(&tcbinfo); } /* * Give the socket an address. */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); error = in_pcbbind(inp, nam, td->td_ucred); out: TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #ifdef INET6 static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. 
*/ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } } error = in6_pcbbind(inp, nam, td->td_ucred); out: TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #endif /* INET6 */ /* * Prepare to accept connections. */ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); tcp_offload_listen_open(tp); } SOCK_UNLOCK(so); out: TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #ifdef INET6 static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); } SOCK_UNLOCK(so); out: TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #endif /* INET6 */ /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. 
*/ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; error = tcp_output_connect(so, nam); out: TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #ifdef INET6 static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; TCPDEBUG0; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; error = tcp_output_connect(so, nam); goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_isipv6 = 1; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; error = tcp_output_connect(so, nam); out: TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } /* * Accept a connection. Essentially all the work is * done at higher levels; just return the address * of the peer, storing through addr. 
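The v4-mapped branch of tcp6_usr_connect() above converts a ::ffff:a.b.c.d destination into an IPv4 socket address before falling back to the IPv4 connect path. A minimal userland sketch of that conversion follows (error handling and sin_len are omitted; the helper name is invented).

#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static int
map_v6_to_v4(const struct sockaddr_in6 *sin6, struct sockaddr_in *sin)
{
    if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
        return (0);                     /* genuine IPv6 destination */
    memset(sin, 0, sizeof(*sin));
    sin->sin_family = AF_INET;
    sin->sin_port = sin6->sin6_port;    /* already in network byte order */
    memcpy(&sin->sin_addr, &sin6->sin6_addr.s6_addr[12], 4);
    return (1);
}

int
main(void)
{
    struct sockaddr_in6 sin6;
    struct sockaddr_in sin;
    char buf[INET_ADDRSTRLEN];

    memset(&sin6, 0, sizeof(sin6));
    sin6.sin6_family = AF_INET6;
    sin6.sin6_port = htons(80);
    inet_pton(AF_INET6, "::ffff:192.0.2.1", &sin6.sin6_addr);
    if (map_v6_to_v4(&sin6, &sin))
        printf("v4-mapped -> %s\n",
            inet_ntop(AF_INET, &sin.sin_addr, buf, sizeof(buf)));
    return (0);
}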
*/ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct in_addr addr; in_port_t port = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_INFO_RLOCK(&tcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; out: TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #ifdef INET6 static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; in_port_t port = 0; int v4 = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; INP_INFO_WLOCK(&tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); error = tcp_output_disconnect(tp); out: TCPDEBUG2(PRU_SHUTDOWN); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (error); } /* * After a receive, possibly send window update to peer. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_output_rcvd(tp); out: TCPDEBUG2(PRU_RCVD); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. 
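The accept paths above copy the peer-address fields while the pcb lock is held and postpone the sockaddr allocation until the lock has been dropped. The following is an illustrative pthread version of that copy-then-allocate pattern; every structure and name here is invented for the sketch.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative shared state standing in for the pcb. */
static struct {
    pthread_mutex_t lock;
    uint32_t        faddr;
    uint16_t        fport;
} conn = { PTHREAD_MUTEX_INITIALIZER, 0xc0000201, 80 };

struct peer { uint32_t addr; uint16_t port; };

static struct peer *
peer_snapshot(void)
{
    uint32_t addr;
    uint16_t port;
    struct peer *p;

    pthread_mutex_lock(&conn.lock);
    addr = conn.faddr;              /* copy the fields of interest... */
    port = conn.fport;
    pthread_mutex_unlock(&conn.lock);

    p = malloc(sizeof(*p));         /* ...and allocate with no lock held */
    if (p != NULL) {
        p->addr = addr;
        p->port = port;
    }
    return (p);
}

int
main(void)
{
    struct peer *p = peer_snapshot();

    if (p != NULL)
        printf("peer %08x port %u\n", (unsigned)p->addr, (unsigned)p->port);
    free(p);
    return (0);
}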
*/ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; int headlocked = 0; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* * We require the pcbinfo lock in two cases: * * (1) An implied connect is taking place, which can result in * binding IPs and ports and hence modification of the pcb hash * chains. * * (2) PRUS_EOF is set, resulting in explicit close on the send. */ if ((nam != NULL) || (flags & PRUS_EOF)) { INP_INFO_WLOCK(&tcbinfo); headlocked = 1; } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); if (m) m_freem(m); error = ECONNRESET; goto out; } #ifdef INET6 isipv6 = nam && nam->sa_family == AF_INET6; #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { sbappendstream(&so->so_snd, m); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); else #endif /* INET6 */ error = tcp_connect(tp, nam, td); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } if (headlocked) { INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } if (tp != NULL) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tcp_output_send(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); else #endif /* INET6 */ error = tcp_connect(tp, nam, td); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } else if (nam) { INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; error = tcp_output_send(tp); tp->t_flags &= ~TF_FORCEDATA; } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return (error); } /* * Abort the TCP. Drop the connection abruptly. 
*/ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_drop(tp, ECONNABORTED); TCPDEBUG2(PRU_ABORT); } if (!(inp->inp_vflag & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_vflag |= INP_SOCKREF; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); } /* * TCP socket is closed. Start friendly disconnect. */ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); } if (!(inp->inp_vflag & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_vflag |= INP_SOCKREF; } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); } /* * Receive out-of-band data. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCPDEBUG2(PRU_RCVOOB); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); } struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp_usr_bind, .pru_connect = tcp_usr_connect, .pru_control = in_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, .pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #ifdef INET6 struct pr_usrreqs tcp6_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp6_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp6_usr_bind, .pru_connect = tcp6_usr_connect, .pru_control = in6_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = 
tcp_usr_close, }; #endif /* INET6 */ /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state. */ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct in_addr laddr; u_short lport; int error; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) return error; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) return error; if (oinp) return EADDRINUSE; inp->inp_laddr = laddr; in_pcbrehash(inp); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; } #ifdef INET6 static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr *addr6; int error; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) return error; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. * in6_pcbladdr() also handles scope zone IDs. */ error = in6_pcbladdr(inp, nam, &addr6); if (error) return error; oinp = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL); if (oinp) return EADDRINUSE; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = *addr6; inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) inp->in6p_flowinfo |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); /* Compute window scaling to request. 
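A small userland sketch of the scale-selection loop used by tcp_connect() above (and repeated for the IPv6 path just below): pick the smallest window shift, capped at 14, whose scaled 16-bit window covers the desired receive buffer. TCP_MAXWIN and TCP_MAX_WINSHIFT carry their standard values; everything else is illustrative.

#include <stdio.h>

#define TCP_MAXWIN          65535U
#define TCP_MAX_WINSHIFT    14

static int
window_scale_for(unsigned long rcv_limit)
{
    int scale = 0;

    while (scale < TCP_MAX_WINSHIFT &&
        ((unsigned long)TCP_MAXWIN << scale) < rcv_limit)
        scale++;
    return (scale);
}

int
main(void)
{
    printf("64KB -> %d, 1MB -> %d, 16MB -> %d\n",
        window_scale_for(64UL * 1024), window_scale_for(1UL << 20),
        window_scale_for(16UL << 20));
    return (0);
}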
*/ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) tp->request_r_scale++; soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = tp->snd_bwnd; } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. 
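A pthread analogue of the unlock/copy/re-lock sequence the comment describes and the renamed INP_WLOCK_RECHECK macro implements below: the lock is dropped around the (potentially sleeping) copy of the option argument, then reacquired, and the connection is revalidated before the option is applied. Everything in this sketch is invented for illustration.

#include <errno.h>
#include <pthread.h>

struct conn {
    pthread_mutex_t lock;
    int             dropped;    /* set once the connection is gone */
    int             flags;
};

static int
set_option(struct conn *c, const int *uval)
{
    int val;

    pthread_mutex_lock(&c->lock);
    if (c->dropped) {
        pthread_mutex_unlock(&c->lock);
        return (ECONNRESET);
    }
    pthread_mutex_unlock(&c->lock); /* don't hold the lock across the copy */

    val = *uval;                    /* stand-in for the user copy */

    pthread_mutex_lock(&c->lock);   /* re-lock and re-check validity */
    if (c->dropped) {
        pthread_mutex_unlock(&c->lock);
        return (ECONNRESET);
    }
    if (val)
        c->flags |= 0x1;
    else
        c->flags &= ~0x1;
    pthread_mutex_unlock(&c->lock);
    return (0);
}

int
main(void)
{
    struct conn c = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
    int one = 1;

    return (set_option(&c, &one));
}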
*/ -#define INP_LOCK_RECHECK(inp) do { \ - INP_LOCK(inp); \ +#define INP_WLOCK_RECHECK(inp) do { \ + INP_WLOCK(inp); \ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { \ - INP_UNLOCK(inp); \ + INP_WUNLOCK(inp); \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } else { #endif /* INET6 */ - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); #ifdef INET6 } #endif return (error); } if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (ECONNRESET); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); - INP_LOCK_RECHECK(inp); + INP_WLOCK_RECHECK(inp); if (optval > 0) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; #endif /* TCP_SIGNATURE */ case TCP_NODELAY: case TCP_NOOPT: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); - INP_LOCK_RECHECK(inp); + INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; case TCP_NOPUSH: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); - INP_LOCK_RECHECK(inp); + INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else { tp->t_flags &= ~TF_NOPUSH; error = tcp_output(tp); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; case TCP_MAXSEG: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); - INP_LOCK_RECHECK(inp); + INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); break; case TCP_INFO: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = EINVAL; break; default: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: optval = (tp->t_flags & TF_SIGNATURE) ? 
1 : 0; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; default: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } -#undef INP_LOCK_RECHECK +#undef INP_WLOCK_RECHECK /* * tcp_sendspace and tcp_recvspace are the default send and receive window * sizes, respectively. These are obsolescent (this information should * be set by the route). */ u_long tcp_sendspace = 1024*32; SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); u_long tcp_recvspace = 1024*64; SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, * bufer space, and entering LISTEN state if to accept connections. */ static int tcp_attach(struct socket *so) { struct tcpcb *tp; struct inpcb *inp; int error; #ifdef INET6 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; #endif if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, tcp_sendspace, tcp_recvspace); if (error) return (error); } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; INP_INFO_WLOCK(&tcbinfo); error = in_pcballoc(so, &tcbinfo); if (error) { INP_INFO_WUNLOCK(&tcbinfo); return (error); } inp = sotoinpcb(so); #ifdef INET6 if (isipv6) { inp->inp_vflag |= INP_IPV6; inp->in6p_hops = -1; /* use kernel default */ } else #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { #ifdef INET6 if (isipv6) { in6_pcbdetach(inp); in6_pcbfree(inp); } else { #endif in_pcbdetach(inp); in_pcbfree(inp); #ifdef INET6 } #endif INP_INFO_WUNLOCK(&tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return (0); } /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. 
*/ if (tp->t_state < TCPS_ESTABLISHED) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_vflag & INP_DROPPED)) tcp_output_disconnect(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { case TCPS_LISTEN: tcp_offload_listen_close(tp); /* FALLTHROUGH */ case TCPS_CLOSED: tp->t_state = TCPS_CLOSED; tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tp->t_state = TCPS_FIN_WAIT_1; break; case TCPS_CLOSE_WAIT: tp->t_state = TCPS_LAST_ACK; break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : tcp_maxidle; tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? 
", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LQ_OVERFLOW) { db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); db_print_indent(indent); db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); db_print_indent(indent); db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, tp->snd_recover); db_print_indent(indent); db_printf("t_maxopd: %u t_rcvtime: %lu t_startime: %lu\n", tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %d t_rtsq: 0x%08x t_bw_rtttime: %d\n", tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); db_print_indent(indent); db_printf("t_bw_rtseq: 
0x%08x t_rxtcur: %d t_maxseg: %u " "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %lu\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x " "t_badrxtwin: %lu\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 178284) +++ head/sys/netinet/udp_usrreq.c (revision 178285) @@ -1,1170 +1,1172 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef IPSEC #include #endif #include #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ static int udp_cksum = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum, 0, ""); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); int udp_blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &udp_blackhole, 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); static void udp_zone_change(void *tag) { uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } void udp_init(void) { INP_INFO_LOCK_INIT(&udbinfo, "udp"); LIST_INIT(&udb); udbinfo.ipi_listhead = &udb; udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.ipi_hashmask); udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.ipi_porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } /* * Subroutine of udp_input(), which appends the 
provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); ipsec4stat.in_polvio++; return; } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; savedflags = inp->inp_flags; inp->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(inp, n, &opts); inp->inp_flags = savedflags; } else #endif ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; } else sorwakeup_locked(so); } void udp_input(struct mbuf *m, int off) { int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; int len; struct ip save_ip; struct sockaddr_in udp_in; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif ifp = m->m_pkthdr.rcvif; udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. 
*/ if (uh->uh_sum) { u_short uh_sum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { udpstat.udps_badsum++; m_freem(m); return; } } else udpstat.udps_nosum++; #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { struct sockaddr_in *next_hop; /* * Do the hack. */ next_hop = (struct sockaddr_in *)(fwd_tag + 1); ip->ip_dst = next_hop->sin_addr; uh->uh_dport = ntohs(next_hop->sin_port); /* * Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } #endif INP_INFO_RLOCK(&udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; last = NULL; LIST_FOREACH(inp, &udb, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; /* * XXX: Do not check source port of incoming datagram * unless inp_connect() has been called to bind the * fport part of the 4-tuple; the source could be * trying to talk to us with an ephemeral port. */ if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; - INP_LOCK(inp); + INP_WLOCK(inp); /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL) { struct sockaddr_in sin; struct in_msource *ims; int blocked, mode; size_t idx; bzero(&sin, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = ip->ip_dst; blocked = 0; idx = imo_match_group(imo, ifp, (struct sockaddr *)&sin); if (idx == -1) { /* * No group membership for this socket. * Do not bump udps_noportbcast, as * this will happen further down. */ blocked++; } else { /* * Check for a multicast source filter * entry on this socket for this group. * MCAST_EXCLUDE is the default * behaviour. It means default accept; * entries, if present, denote sources * to be excluded from delivery. */ ims = imo_match_source(imo, idx, (struct sockaddr *)&udp_in); mode = imo->imo_mfilters[idx].imf_fmode; if ((ims != NULL && mode == MCAST_EXCLUDE) || (ims == NULL && mode == MCAST_INCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: blocked by" " source filter\n", __func__); } #endif udpstat.udps_filtermcast++; blocked++; } } if (blocked != 0) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr), &udp_in); - INP_UNLOCK(last); + INP_WUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. 
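/*
 * Illustrative sketch (not part of this change): the value udp_input()
 * verifies above is the RFC 768 checksum over an IPv4 pseudo-header plus
 * the UDP header and payload.  The kernel folds the pseudo-header in with
 * in_pseudo()/in_cksum(); the standalone function below computes the same
 * quantity in plain C.  Names and layout are the textbook form, not the
 * kernel's struct udpiphdr, and the result is in host byte order (a sender
 * would htons() it into uh_sum; an all-zero result is transmitted as 0xffff).
 */
#include <stdint.h>
#include <stddef.h>

static uint16_t
udp4_cksum(const uint8_t src[4], const uint8_t dst[4],
    const uint8_t *udp, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo-header: source, destination, zero/protocol, UDP length. */
	sum += (uint32_t)(src[0] << 8 | src[1]) + (uint32_t)(src[2] << 8 | src[3]);
	sum += (uint32_t)(dst[0] << 8 | dst[1]) + (uint32_t)(dst[2] << 8 | dst[3]);
	sum += 17;				/* IPPROTO_UDP */
	sum += (uint32_t)len;

	/* UDP header plus payload as 16-bit big-endian words. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(udp[i] << 8 | udp[i + 1]);
	if (len & 1)
		sum += (uint32_t)(udp[len - 1] << 8);	/* pad the odd byte */

	/* Fold the carries and take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}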
This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ udpstat.udps_noportbcast++; goto badheadlocked; } udp_append(last, ip, m, iphlen + sizeof(struct udphdr), &udp_in); - INP_UNLOCK(last); + INP_WUNLOCK(last); INP_INFO_RUNLOCK(&udbinfo); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, ifp); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { udpstat.udps_noportbcast++; goto badheadlocked; } if (udp_blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&udbinfo); return; } /* * Check the minimum TTL for socket. */ - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) goto badheadlocked; udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); return; badheadlocked: if (inp) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); badunlocked: m_freem(m); } /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { + INP_WLOCK_ASSERT(inp); + inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); } else in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], udp_notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. 
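/*
 * Minimal userland sketch of the two-pass pattern this handler supports:
 * calling sysctl with a NULL old pointer returns only the size estimate
 * computed below, and a second call fetches the xinpgen/xinpcb records
 * (this is essentially what netstat does).  Error handling is abbreviated
 * and the printed message is only an example.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 0;
	void *buf;

	/* First pass: ask only for the estimated buffer size. */
	if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) == -1)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	/* Second pass: fetch the records themselves. */
	if (sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0) == -1) {
		free(buf);
		return (1);
	}
	printf("net.inet.udp.pcblist returned %zu bytes\n", len);
	free(buf);
	return (0);
}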
*/ if (req->oldptr == 0) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&udbinfo); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&udbinfo); for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&udbinfo); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_INFO_RUNLOCK(&udbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct sockaddr_in *sin, src; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. 
Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { m_freem(m); return (error); } if (src.sin_family == AF_INET || addr != NULL) { INP_INFO_WLOCK(&udbinfo); unlock_udbinfo = 1; } else unlock_udbinfo = 0; - INP_LOCK(inp); + INP_WLOCK(inp); #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } if (addr) { sin = (struct sockaddr_in *)addr; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { /* * Remember addr if jailed, to prevent rebinding. */ if (jailed(td->td_ucred)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); /* * Set the Don't Fragment bit in the IP header. 
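/*
 * Illustrative userland sketch of the IP_SENDSRCADDR control message parsed
 * above: a UDP socket that is already bound to a local port can pick the
 * source address of a single datagram via sendmsg(2).  As the kernel code
 * shows, an unbound socket gets EINVAL.  This is only a sketch; the helper
 * name and abbreviated error handling are not part of the source.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static ssize_t
send_from(int s, struct sockaddr_in *dst, struct in_addr src,
    const void *payload, size_t paylen)
{
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cm;
	char cbuf[CMSG_SPACE(sizeof(struct in_addr))];

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	iov.iov_base = (void *)payload;
	iov.iov_len = paylen;
	msg.msg_name = dst;
	msg.msg_namelen = sizeof(*dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* One cmsg carrying the requested source address. */
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_SENDSRCADDR;
	cm->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
	memcpy(CMSG_DATA(cm), &src, sizeof(src));

	return (sendmsg(s, &msg, 0));
}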
*/ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= IP_DF; } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; /* * Set up checksum and output datagram. */ if (udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else ui->ui_sum = 0; ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; if (unlock_udbinfo) INP_INFO_WUNLOCK(&udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (error); release: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); if (unlock_udbinfo) INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return (error); } static void udp_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; soisdisconnected(so); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(&udbinfo); error = in_pcballoc(so, &udbinfo); if (error) { INP_INFO_WUNLOCK(&udbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (0); } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; soisdisconnected(so); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; struct sockaddr_in *sin; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); 
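/*
 * Illustrative aside on the INP_LOCK() -> INP_WLOCK() renaming visible in
 * the surrounding hunks: the W spelling marks call sites that need exclusive
 * (write) access to the inpcb, so read-locked fast paths can be introduced
 * later.  The userland analogy below uses pthread rwlocks; it sketches the
 * read/write split only and is not the kernel's inpcb locking implementation.
 */
#include <pthread.h>

struct conn {
	pthread_rwlock_t lock;
	unsigned long packets;
};

static unsigned long
conn_stats(struct conn *c)
{
	unsigned long n;

	pthread_rwlock_rdlock(&c->lock);	/* readers may run in parallel */
	n = c->packets;
	pthread_rwlock_unlock(&c->lock);
	return (n);
}

static void
conn_deliver(struct conn *c)
{
	pthread_rwlock_wrlock(&c->lock);	/* mutators take the write lock */
	c->packets++;
	pthread_rwlock_unlock(&c->lock);
}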
INP_INFO_WUNLOCK(&udbinfo); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); - INP_UNLOCK(inp); return (ENOTCONN); } in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); socantsendmore(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (0); } struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 178284) +++ head/sys/netinet6/icmp6.c (revision 178285) @@ -1,2786 +1,2786 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif extern struct domain inet6domain; struct icmp6stat icmp6stat; extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error(struct mbuf **, int, int, int); void icmp6_init(void) { mld6_init(); } static void icmp6_errcount(struct icmp6errstat *stat, int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. 
*/ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; icmp6stat.icp6s_error++; /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { icmp6stat.icp6s_canterror++; goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Pakcet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { icmp6stat.icp6s_toofreq++; goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && m->m_len < preplen) m = m_pullup(m, preplen); if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). 
*/ m->m_pkthdr.rcvif = NULL; icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. 
*/ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); int n0len; MGETHDR(n, M_DONTWAIT, n0->m_type); n0len = n0->m_pkthdr.len; /* save for use below */ if (n) M_MOVE_PKTHDR(n, n0); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { nip6 = mtod(n, struct ip6_hdr *); IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? 
*/ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n && !m_dup_pkthdr(n, m, M_DONTWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. 
*/ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: icmp6stat.icp6s_badcode++; break; badlen: icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. 
path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. 
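/*
 * Illustrative sketch of why the scope zone matters for the sockaddr_in6
 * structures built here: a link-local address is ambiguous without a zone,
 * so userland code normally pins it to an interface via sin6_scope_id.
 * The helper name is made up for the example; e.g.
 * fill_linklocal(&sa, "fe80::1", "em0").
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>

static int
fill_linklocal(struct sockaddr_in6 *sin6, const char *addr, const char *ifname)
{
	memset(sin6, 0, sizeof(*sin6));
	sin6->sin6_len = sizeof(*sin6);
	sin6->sin6_family = AF_INET6;
	if (inet_pton(AF_INET6, addr, &sin6->sin6_addr) != 1)
		return (-1);
	/* fe80::/10 means nothing without a zone; name one explicitly. */
	sin6->sin6_scope_id = if_nametoindex(ifname);
	return (sin6->sin6_scope_id != 0 ? 0 : -1);
}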
*/ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; bzero(&inc, sizeof(inc)); inc.inc_flags = 1; /* IPv6 */ inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); icmp6stat.icp6s_pmtuchg++; } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(hostname) static struct mbuf * ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. 
* The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ if ((ia6 = ip6_getdstifaddr(m)) == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. 
*/ n = ni6_nametodns(hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } M_MOVE_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? 
*/ n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? 
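For reference, ni6_nametodns() above emits the queried hostname in DNS wire format: each label preceded by a one-byte length (at most 63 octets, per RFC 1035), one terminating NUL for an FQDN and two when the name looks truncated. Below is a minimal user-space sketch of the same encoding with the truncated-name case omitted; the function name dns_encode_name is invented for this illustration.

#include <stddef.h>
#include <string.h>

/*
 * Encode "name" (e.g. "foo.example.org") into DNS label format:
 * each label preceded by its length, terminated by a single '\0'.
 * Returns the encoded length, or 0 on error (label too long, empty
 * label, or no room in the output buffer).
 */
static size_t
dns_encode_name(const char *name, unsigned char *buf, size_t buflen)
{
	size_t off = 0;
	const char *p = name;

	if (buflen == 0)
		return (0);
	while (*p != '\0') {
		const char *dot = strchr(p, '.');
		size_t len = dot ? (size_t)(dot - p) : strlen(p);

		if (len == 0 || len >= 64)	/* RFC 1035 label limits */
			return (0);
		if (off + 1 + len + 1 > buflen)
			return (0);
		buf[off++] = (unsigned char)len;
		memcpy(buf + off, p, len);
		off += len;
		p += len;
		if (*p == '.')
			p++;
	}
	buf[off++] = '\0';
	return (off);
}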
*/ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; IFNET_RUNLOCK(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK(); return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp = ifp0 ? 
ifp0 : TAILQ_FIRST(&ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ IFNET_RLOCK(); again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_second) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK(); return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; struct in6pcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. 
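The TTL that ni6_store_addrs() above writes in front of each address is the remaining valid lifetime of that address in seconds: all-ones when the address never expires, zero once it has expired, otherwise expire - now in network byte order. A user-space restatement of just that rule follows; the function name is invented, and ND6_INFINITE_LIFETIME is assumed to mirror the kernel's nd6.h constant.

#include <sys/types.h>
#include <arpa/inet.h>

#define ND6_INFINITE_LIFETIME	0xffffffffU	/* assumed to match nd6.h */

static u_int32_t
ni_addr_ttl(time_t expire, time_t now)
{
	if (expire == 0)
		return (ND6_INFINITE_LIFETIME);	/* no expiration configured */
	if (expire > now)
		return (htonl((u_int32_t)(expire - now)));
	return (0);				/* already expired */
}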
*/ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK(&ripcbinfo); LIST_FOREACH(in6p, &ripcb, inp_list) { - INP_LOCK(in6p); + INP_WLOCK(in6p); if ((in6p->inp_vflag & INP_IPV6) == 0) { docontinue: - INP_UNLOCK(in6p); + INP_WUNLOCK(in6p); continue; } if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) goto docontinue; if (in6p->in6p_icmp6filt && ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) goto docontinue; if (last) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. 
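The per-socket filter consulted above via ICMP6_FILTER_WILLBLOCK() is the RFC 3542 ICMP6_FILTER socket option. A minimal application-side sketch follows, showing a raw ICMPv6 socket set up to receive Echo Replies only; error handling is trimmed and the function name is invented for the example.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

/* Open a raw ICMPv6 socket that the kernel filters down to Echo Replies. */
static int
open_echo_reply_socket(void)
{
	struct icmp6_filter filt;
	int s;

	s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	if (s < 0)
		return (-1);
	ICMP6_FILTER_SETBLOCKALL(&filt);
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
	(void)setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, &filt,
	    sizeof(filt));
	return (s);
}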
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked( &last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); opts = NULL; } - INP_UNLOCK(last); + INP_WUNLOCK(last); } last = in6p; } if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); - INP_UNLOCK(last); + INP_WUNLOCK(last); } else { m_freem(m); ip6stat.ip6s_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; int plen; int type, code; struct ifnet *outif = NULL; struct in6_addr origdst, *src = NULL; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ origdst = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. 
* The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. * Note that ip6_getdstifaddr() may fail if we are in an error handling * procedure of an outgoing packet of our own, in which case we need * to search in the ifaddr list. */ if (!IN6_IS_ADDR_MULTICAST(&origdst)) { if ((ia = ip6_getdstifaddr(m))) { if (!(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) src = &ia->ia_addr.sin6_addr; } else { struct sockaddr_in6 d; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(d); d.sin6_addr = origdst; ia = (struct in6_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&d); if (ia && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src = &ia->ia_addr.sin6_addr; } } } if (src == NULL) { int e; struct sockaddr_in6 sin6; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &sin6.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (outif) ip6->ip6_hlim = ND_IFINFO(outif)->chlim; else if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else ip6->ip6_hlim = ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { return; } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; if (!m) return; ifp = m->m_pkthdr.rcvif; if (!ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (ip6_forwarding) goto freeit; if (!icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif redtgt6 = 
nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE_LOCKED(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. 
*/ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif /* IPSEC */ } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
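The hunk that follows implements the rule just described. Condensed to plain arithmetic, the decision is: pad the enclosed original packet up to a multiple of 8 bytes when the mbuf still has room for the padding, otherwise truncate it down. A sketch of only that arithmetic is shown here; the names are invented, and the real code of course operates on mbuf lengths.

#include <stddef.h>

/*
 * Pad-or-truncate rule for the redirected header option: the option
 * body must be a multiple of 8 bytes.  Returns the adjusted length.
 */
static size_t
rd_hdr_adjust(size_t pktlen, size_t trailingspace)
{
	size_t extra = pktlen % 8;

	if (extra == 0)
		return (pktlen);
	if (8 - extra <= trailingspace)
		return (pktlen + (8 - extra));	/* pad */
	return (pktlen - extra);		/* truncate */
}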
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter *p; if (optlen != sizeof(*p)) { error = EMSGSIZE; break; } if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, optlen); break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, sizeof(struct icmp6_filter)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? 
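icmp6_ratelimit(), defined just below, delegates the actual accounting to ppsratecheck(9): a (timestamp, counter) pair tracks how many events fired in the current second, and the call returns non-zero while the caller is still under its packets-per-second budget. The following is only a schematic sketch of that calling pattern, assuming the usual kernel include set; the my_err_* names are invented and 100 is merely an example limit (cf. the icmp6errppslim sysctl used below).

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>

static struct timeval my_err_last;	/* start of the current second */
static int my_err_curpps;		/* events seen in that second */
static int my_err_maxpps = 100;		/* example budget, events/sec */

/* Non-zero: still under budget, okay to send another error packet. */
static int
my_error_allowed(void)
{
	return (ppsratecheck(&my_err_last, &my_err_curpps, my_err_maxpps));
}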
* * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c (revision 178284) +++ head/sys/netinet6/in6_pcb.c (revision 178285) @@ -1,910 +1,910 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include struct in6_addr zeroin6_addr; int in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); INP_INFO_WLOCK_ASSERT(pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (!in6_ifaddr) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; if (nam) { int error; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * family check. */ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return(error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ia = NULL; sin6->sin6_port = 0; /* yech... */ if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) return (EADDRNOTAVAIL); /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. 
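The reuse rules that in6_pcbbind() checks above have a simple application-side counterpart: a listener that wants to share a port (for example, several multicast receivers on one group and port) sets SO_REUSEPORT, and for multicast binds the kernel treats SO_REUSEADDR the same way. A minimal sketch of such a bind under a BSD sockets environment follows; error handling is trimmed and the function name is invented.

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

/* Bind an IPv6 UDP socket to the wildcard address with port sharing. */
static int
bind_v6_any(unsigned short port)
{
	struct sockaddr_in6 sin6;
	int s, on = 1;

	s = socket(AF_INET6, SOCK_DGRAM, 0);
	if (s < 0)
		return (-1);
	(void)setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);		/* BSD-specific field */
	sin6.sin6_port = htons(port);
	sin6.sin6_addr = in6addr_any;
	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) < 0)
		return (-1);
	return (s);
}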
*/ if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { return (EADDRNOTAVAIL); } } if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) <= ipport_reservedhigh && ntohs(lport) >= ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(so->so_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD); if (t && ((t->inp_vflag & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && (so->so_cred->cr_uid != t->inp_socket->so_cred->cr_uid)) return (EADDRINUSE); if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD); if (t && ((t->inp_vflag & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (so->so_cred->cr_uid != t->inp_socket->so_cred->cr_uid)) return (EADDRINUSE); } } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, wild); if (t && (reuseport & ((t->inp_vflag & INP_TIMEWAIT) ? intotw(t)->tw_so_options : t->inp_socket->so_options)) == 0) return (EADDRINUSE); if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, wild); if (t && t->inp_vflag & INP_TIMEWAIT) { if ((reuseport & intotw(t)->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || ((inp->inp_vflag & INP_IPV6PROTO) == (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & t->inp_socket->so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))) return (EADDRINUSE); } } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { int e; if ((e = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) return (e); } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). 
*/ int in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam, struct in6_addr **plocal_addr6) { register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; struct ifnet *ifp = NULL; int scope_ambiguous = 0; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return(error); if (in6_ifaddr) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } /* * XXX: in6_selectsrc might replace the bound local address * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, inp->in6p_moptions, NULL, &inp->in6p_laddr, &ifp, &error); if (ifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) { return(error); } if (*plocal_addr6 == 0) { if (error == 0) error = EADDRNOTAVAIL; return (error); } /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in6_pcbconnect(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct in6_addr *addr6; register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); if (in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 
addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, cred); if (error) return (error); } inp->in6p_laddr = *addr6; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) inp->in6p_flowinfo |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); return (0); } void in6_pcbdisconnect(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } void in6_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("in6_pcbdetach: inp_socket == NULL")); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } void in6_pcbfree(struct inpcb *inp) { struct inpcbinfo *ipi = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("in6_pcbfree: inp_socket != NULL")); INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef IPSEC if (inp->in6p_sp != NULL) ipsec6_delete_pcbpolicy(inp); #endif /* IPSEC */ inp->inp_gencnt = ++ipi->ipi_gencnt; in_pcbremlists(inp); ip6_freepcbopts(inp->in6p_outputopts); ip6_freemoptions(inp->in6p_moptions); /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) (void)m_free(inp->inp_options); if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; #ifdef MAC mac_inpcb_destroy(inp); #endif - INP_UNLOCK(inp); + INP_WUNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; MALLOC(sin6_p, struct sockaddr_in6 *, sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { register struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = 
sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; int errno; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return; sa6_dst = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6_rtchange to invalidate the route cache. * Dead host indications: also use in6_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. Note that we notify for all * disconnected sockets if the corresponding application * wanted. This is because some UDP applications keep sending * sockets disconnected. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) { ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, (u_int32_t *)cmdarg); } /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. 
Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->in6p_flowinfo & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); continue; } do_notify: if (notify) { if ((*notify)(inp, errno)) - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } else - INP_UNLOCK(inp); + INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_int lport_arg, int wild_okay) { register struct inpcb *inp; int matchwild = 3, wildcard; u_short lport = lport_arg; INP_INFO_WLOCK_ASSERT(pcbinfo); if (!wild_okay) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* * Found. */ return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) { break; } } } } return (match); } } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct in6pcb *in6p; struct ip6_moptions *im6o; struct in6_multi_mship *imm, *nimm; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { - INP_LOCK(in6p); + INP_WLOCK(in6p); im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o) { /* * Unselect the outgoing interface if it is being * detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * XXX controversial - is it really legal for kernel * to force this? 
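The mechanical INP_LOCK/INP_UNLOCK to INP_WLOCK/INP_WUNLOCK substitutions in these hunks follow from the inpcb lock becoming a read/write lock in this revision: paths that modify the pcb, as the ones above do, take the exclusive (write) lock. As a schematic illustration only, and not code from this diff, a purely read-only path could instead use the shared lock; the example function below is invented and assumes the usual kernel include set.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>

/* Schematic: inspect a pcb field under the shared (read) inpcb lock. */
static u_short
example_read_lport(struct inpcb *inp)
{
	u_short lport;

	INP_RLOCK(inp);
	lport = inp->inp_lport;
	INP_RUNLOCK(inp);
	return (lport);
}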
*/ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = nimm) { nimm = imm->i6mm_chain.le_next; if (imm->i6mm_maddr->in6m_ifp == ifp) { LIST_REMOVE(imm, i6mm_chain); in6_delmulti(imm->i6mm_maddr); free(imm, M_IP6MADDR); } } } - INP_UNLOCK(in6p); + INP_WUNLOCK(in6p); } INP_INFO_RUNLOCK(pcbinfo); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *in6p) { /* * We don't store route pointers in the routing table anymore */ return; } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno) { /* * We don't store route pointers in the routing table anymore */ return inp; } /* * Lookup PCB in hash list. */ struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int wildcard, struct ifnet *ifp) { struct inpcbhead *head; register struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; int faith; - INP_INFO_RLOCK_ASSERT(pcbinfo); + INP_INFO_LOCK_ASSERT(pcbinfo); if (faithprefix_p != NULL) faith = (*faithprefix_p)(laddr); else faith = 0; /* * First look for an exact match. */ head = &pcbinfo->ipi_hashbase[ INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * Found. */ return (inp); } } if (wildcard) { struct inpcb *local_wild = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && inp->inp_lport == lport) { if (faith && (inp->inp_flags & INP_FAITH) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) return (inp); else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) local_wild = inp; } } return (local_wild); } /* * Not found. */ return (NULL); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } Index: head/sys/netinet6/in6_src.c =================================================================== --- head/sys/netinet6/in6_src.c (revision 178284) +++ head/sys/netinet6/in6_src.c (revision 178285) @@ -1,1098 +1,1098 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
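One more note before the in6_src.c changes: in6_pcblookup_hash(), shown just before this file, makes two passes over its candidates. It first looks for a fully specified 4-tuple match (a connected socket) and only then falls back to wildcard listeners, preferring one bound to the exact local address over one bound to the unspecified address. A hedged sketch of that preference order; demo_pcb, lookup() and the flat array standing in for the hash chains are illustrative.

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-in for an inpcb entry; 0 means "unspecified". */
struct demo_pcb {
    int laddr, lport;
    int faddr, fport;
};

/*
 * Pass 1: exact 4-tuple match (a connected socket).
 * Pass 2: wildcard listeners on lport, preferring one bound to the exact
 * local address, falling back to one bound to the unspecified address
 * (the local_wild of in6_pcblookup_hash()).
 */
static struct demo_pcb *
lookup(struct demo_pcb *tab, size_t n, int faddr, int fport, int laddr,
    int lport)
{
    struct demo_pcb *local_wild = NULL;

    for (size_t i = 0; i < n; i++)
        if (tab[i].faddr == faddr && tab[i].fport == fport &&
            tab[i].laddr == laddr && tab[i].lport == lport)
            return (&tab[i]);

    for (size_t i = 0; i < n; i++) {
        if (tab[i].faddr != 0 || tab[i].lport != lport)
            continue;
        if (tab[i].laddr == laddr)
            return (&tab[i]);
        if (tab[i].laddr == 0)
            local_wild = &tab[i];
    }
    return (local_wild);
}

int
main(void)
{
    struct demo_pcb tab[] = {
        { 0, 80, 0, 0 },          /* wildcard listener on port 80 */
        { 10, 80, 99, 1234 },     /* connected socket */
    };
    struct demo_pcb *p = lookup(tab, 2, 99, 1234, 10, 80);

    printf("matched the %s\n", p == &tab[1] ? "connected pcb" : "listener");
    return (0);
}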
* * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr = 0; static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **, int, int)); static int in6_selectif __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *ro, struct ifnet **)); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy __P((int (*)(struct in6_addrpolicy *, void *), void *)); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) struct in6_addr * in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct in6_addr *laddr, struct ifnet **ifpp, int *errorp) { struct in6_addr dst; struct ifnet *ifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; dst = dstsock->sin6_addr; /* make a copy for local operation */ *errorp = 0; if (ifpp) *ifpp = NULL; /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { struct sockaddr_in6 srcsock; struct in6_ifaddr *ia6; /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) { return (NULL); } /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ bzero(&srcsock, sizeof(srcsock)); srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; if (ifp) { *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (*errorp != 0) return (NULL); } ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { *errorp = EADDRNOTAVAIL; return (NULL); } pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; return (&ia6->ia_addr.sin6_addr); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) return (laddr); /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) return (NULL); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif *errorp = in6_setscope(&dst, ifp, &odstzone); if (*errorp != 0) return (NULL); for (ia = in6_ifaddr; ia; ia = ia->ia_next) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. 
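The REPLACE(), NEXT() and BREAK() macros defined above bump a per-rule counter and then jump to a label, because the rule chain sits inside a for loop where a bare continue or break could not record which rule made the decision. A small stand-alone sketch of that idiom; the two toy rules (prefer odd, then larger) are illustrative, not the actual address-selection rules.

#include <stdio.h>

/* Per-rule decision counters, analogous to ip6s_sources_rule[]. */
static unsigned long rule_hits[3];

#define REPLACE(r) do { rule_hits[(r)]++; goto replace; } while (0)
#define NEXT(r)    do { rule_hits[(r)]++; goto next;    } while (0)

/*
 * Pick the "best" value from a list using a chain of ordered rules; each
 * decision bumps a counter before moving on, which is why the loop relies
 * on goto labels instead of plain continue/break.
 */
static int
select_best(const int *v, int n)
{
    int best = -1;

    for (int i = 0; i < n; i++) {
        if (best < 0)
            REPLACE(0);             /* rule 0: first usable candidate */
        if ((v[i] & 1) && !(best & 1))
            REPLACE(1);             /* rule 1: prefer odd values */
        if (!(v[i] & 1) && (best & 1))
            NEXT(1);
        if (v[i] > best)
            REPLACE(2);             /* rule 2: prefer larger values */
        NEXT(2);                    /* last resort: keep current best */
replace:
        best = v[i];
next:
        continue;
    }
    return (best);
}

int
main(void)
{
    int v[] = { 4, 7, 12, 9 };

    printf("best = %d\n", select_best(v, 4));
    for (int r = 0; r < 3; r++)
        printf("rule %d decided %lu times\n", r, rule_hits[r]);
    return (0);
}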
*/ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? */ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? 
new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { *errorp = EADDRNOTAVAIL; return (NULL); } if (ifpp) *ifpp = ifp; return (&ia->ia_addr.sin6_addr); } /* * clone - meaningful only for bsdi and freebsd */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone, int norouteok) { int error = 0; struct ifnet *ifp = NULL; struct rtentry *rt = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; #if 0 char ip6buf[INET6_ADDRSTRLEN]; if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { printf("in6_selectroute: strange destination %s\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr)); } else { printf("in6_selectroute: destination = %s%%%d\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } getroute: /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron; sin6_next = satosin6(opts->ip6po_nexthop); /* at this moment, we only support AF_INET6 next hops */ if (sin6_next->sin6_family != AF_INET6) { error = EAFNOSUPPORT; /* or should we proceed? */ goto done; } /* * If the next hop is an IPv6 address, then the node identified * by that address must be a neighbor of the sending host. */ ron = &opts->ip6po_nextroute; if ((ron->ro_rt && (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != (RTF_UP | RTF_LLINFO)) || !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, &sin6_next->sin6_addr)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } *satosin6(&ron->ro_dst) = *sin6_next; } if (ron->ro_rt == NULL) { rtalloc((struct route *)ron); /* multi path case? */ if (ron->ro_rt == NULL || !(ron->ro_rt->rt_flags & RTF_LLINFO)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } error = EHOSTUNREACH; goto done; } } rt = ron->ro_rt; ifp = rt->rt_ifp; /* * When cloning is required, try to allocate a route to the * destination so that the caller can store path MTU * information. */ if (!clone) goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. 
*/ if (ro) { if (ro->ro_rt && (!(ro->ro_rt->rt_flags & RTF_UP) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } if (ro->ro_rt == (struct rtentry *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; if (clone) { #ifdef RADIX_MPATH rtalloc_mpath((struct route *)ro, ntohl(sa6->sin6_addr.s6_addr32[3])); #else rtalloc((struct route *)ro); #endif } else { ro->ro_rt = rtalloc1(&((struct route *)ro) ->ro_dst, 0, 0UL); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } } /* * do not care about the result if we have the nexthop * explicitly specified. */ if (opts && opts->ip6po_nexthop) goto done; if (ro->ro_rt) { ifp = ro->ro_rt->rt_ifp; if (ifp == NULL) { /* can this really happen? */ RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (ro->ro_rt == NULL) error = EHOSTUNREACH; rt = ro->ro_rt; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (ifp && opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { error = EHOSTUNREACH; goto done; } } } done: if (ifp == NULL && rt == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. */ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) ip6stat.ip6s_noroute++; if (retifp != NULL) *retifp = ifp; if (retrt != NULL) *retrt = rt; /* rt may be NULL */ return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp) { int error; struct route_in6 sro; struct rtentry *rt = NULL; if (ro == NULL) { bzero(&sro, sizeof(sro)); ro = &sro; } if ((error = selectroute(dstsock, opts, mopts, ro, retifp, &rt, 0, 1)) != 0) { if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (flags); } /* * Adjust the "outgoing" interface. If we're going to loop the packet * back to ourselves, the ifp would be the loopback interface. * However, we'd rather know the interface associated to the * destination address (which should probably be one of our own * addresses.) 
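selectroute() above reuses a cached route only after checking that the entry is still up, was created for the same address family, and still points at the same destination; anything else is freed so a fresh lookup can be made. A condensed sketch of that validation; struct demo_route and cache_usable() are illustrative stand-ins, not the kernel's route types.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for a cached route entry. */
struct demo_route {
    bool     up;        /* plays the role of RTF_UP */
    int      af;        /* family the entry was created for */
    unsigned dstkey;    /* identifies the cached destination */
};

/*
 * Reuse a cached route only if it is still up, was created for the same
 * address family, and still points at the same destination; otherwise the
 * caller must discard it and resolve again, mirroring the ro->ro_rt checks
 * in selectroute().
 */
static bool
cache_usable(const struct demo_route *ro, int af, unsigned dstkey)
{
    return (ro->up && ro->af == af && ro->dstkey == dstkey);
}

int
main(void)
{
    struct demo_route ro = { true, 28, 0x20010db8 };

    printf("%d\n", cache_usable(&ro, 28, 0x20010db8));   /* 1: reuse */
    printf("%d\n", cache_usable(&ro, 2, 0x20010db8));    /* 0: wrong family */
    return (0);
}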
*/ if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) *retifp = rt->rt_ifa->ifa_ifp; if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (0); } /* * clone - meaningful only for bsdi and freebsd */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone) { return (selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone, 0)); } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct in6pcb *in6p, struct ifnet *ifp) { if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { struct route_in6 ro6; struct ifnet *lifp; bzero(&ro6, sizeof(ro6)); ro6.ro_dst.sin6_family = AF_INET6; ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6.ro_dst.sin6_addr = in6p->in6p_faddr; rtalloc((struct route *)&ro6); if (ro6.ro_rt) { lifp = ro6.ro_rt->rt_ifp; RTFREE(ro6.ro_rt); if (lifp) return (ND_IFINFO(lifp)->chlim); } else return (ip6_defhlim); } return (ip6_defhlim); } /* * XXX: this is borrowed from in6_pcbbind(). If possible, we should * share this function by all *bsd*... */ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; if (inp->inp_flags & INP_HIGHPORT) { first = ipport_hifirstauto; /* sysctl */ last = ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; first = ipport_lowfirstauto; /* 1023 */ last = ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = ipport_firstauto; /* sysctl */ last = ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * Simple check to ensure all ports are not used up causing * a deadlock here. * * We split the two cases (up and down) so that the direction * is not being tested on each round of the loop. */ if (first > last) { /* * counting down */ count = first - last; do { if (count-- < 0) { /* completely used? */ /* * Undo any address bind that may have * occurred above. */ inp->in6p_laddr = in6addr_any; return (EAGAIN); } --*lastport; if (*lastport > first || *lastport < last) *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, wild)); } else { /* * counting up */ count = last - first; do { if (count-- < 0) { /* completely used? */ /* * Undo any address bind that may have * occurred above. 
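The counting-down branch of in6_pcbsetport() above rotates a shared last-port cursor through the configured range, wrapping when the cursor leaves [last, first] and giving up with EAGAIN once every port has been tried. A user-space sketch of that loop; in_use() stands in for in6_pcblookup_local() and the port numbers are arbitrary.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for in6_pcblookup_local(): is this port already bound? */
static bool
in_use(unsigned short port)
{
    return (port >= 49170);     /* pretend the top few ports are taken */
}

/*
 * Counting-down allocation over [last, first] with first > last: the shared
 * cursor is decremented, wrapped back to first when it leaves the range,
 * and the search fails once the whole range has been tried, mirroring the
 * EAGAIN path in in6_pcbsetport().  Returns 0 on exhaustion.
 */
static unsigned short
alloc_port(unsigned short first, unsigned short last, unsigned short *cursor)
{
    int count = first - last;
    unsigned short port;

    do {
        if (count-- < 0)
            return (0);         /* completely used */
        (*cursor)--;
        if (*cursor > first || *cursor < last)
            *cursor = first;
        port = *cursor;
    } while (in_use(port));
    return (port);
}

int
main(void)
{
    unsigned short first = 49175, last = 49152, cursor = first;

    printf("allocated port %u\n", (unsigned)alloc_port(first, last, &cursor));
    return (0);
}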
*/ inp->in6p_laddr = in6addr_any; return (EAGAIN); } ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, wild)); } inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } void addrsel_policy_init(void) { ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); init_policy_queue(); /* initialize the "last resort" policy */ bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy)); defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. */ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { int i; struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbages (if any) of the prefix address. */ for (i = 0; i < 4; i++) { ent0.addr.sin6_addr.s6_addr32[i] &= ent0.addrmask.sin6_addr.s6_addr32[i]; } ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The followings are implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX implementation using binary tree should be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); struct addrsel_policyhead addrsel_policytab; static void init_policy_queue(void) { TAILQ_INIT(&addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { struct addrsel_policyent *new, *pol; MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); FREE(new, M_IFADDR); return (EEXIST); /* or override it? 
*/ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 178284) +++ head/sys/netinet6/raw_ip6.c (revision 178285) @@ -1,809 +1,809 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
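match_addrsel_policy(), shown a little earlier in this file, masks the lookup key byte by byte against each policy entry and remembers the entry with the longest matching prefix, counting whole 0xff mask bytes as eight bits and walking a partial mask byte bit by bit. A hedged user-space sketch of that comparison; demo_policy, prefix_matchlen() and best_policy() are illustrative names.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_policy {
    uint8_t addr[16];
    uint8_t mask[16];
    int     label;
};

/*
 * Return how many prefix bits of key match pol, or -1 if the masked key
 * differs from the policy address.  Mirrors the mp/ep/k/p walk in
 * match_addrsel_policy(): a full 0xff byte counts 8 bits at once, a
 * partial mask byte is counted bit by bit from the high end.
 */
static int
prefix_matchlen(const struct demo_policy *pol, const uint8_t key[16])
{
    int matchlen = 0;

    for (int i = 0; i < 16 && pol->mask[i] != 0; i++) {
        uint8_t m = pol->mask[i];

        if ((key[i] & m) != pol->addr[i])
            return (-1);
        if (m == 0xff)
            matchlen += 8;
        else
            for (; m >= 0x80; m = (uint8_t)(m << 1))
                matchlen++;
    }
    return (matchlen);
}

/* Pick the policy with the longest matching prefix, if any. */
static const struct demo_policy *
best_policy(const struct demo_policy *tab, size_t n, const uint8_t key[16])
{
    const struct demo_policy *best = NULL;
    int bestlen = -1, len;

    for (size_t i = 0; i < n; i++)
        if ((len = prefix_matchlen(&tab[i], key)) > bestlen) {
            best = &tab[i];
            bestlen = len;
        }
    return (best);
}

int
main(void)
{
    struct demo_policy tab[2] = {
        { { 0 }, { 0 }, 1 },                                    /* ::/0 */
        { { 0x20, 0x01, 0x0d, 0xb8 }, { 0xff, 0xff, 0xff, 0xff }, 2 },
    };
    uint8_t key[16] = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 1 };

    printf("label %d\n", best_policy(tab, 2, key)->label);      /* label 2 */
    return (0);
}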
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_long rip_sendspace; extern u_long rip_recvspace; struct rip6stat rip6stat; /* * Hooks for multicast forwarding. */ struct socket *ip6_mrouter = NULL; int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(int, caddr_t); /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. 
*/ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX send icmp6 host/port unreach? */ m_freem(m); return IPPROTO_DONE; } init_sin6(&fromsa, m); /* general init */ INP_INFO_RLOCK(&ripcbinfo); LIST_FOREACH(in6p, &ripcb, inp_list) { - INP_LOCK(in6p); + INP_WLOCK(in6p); if ((in6p->in6p_vflag & INP_IPV6) == 0) { docontinue: - INP_UNLOCK(in6p); + INP_WUNLOCK(in6p); continue; } if (in6p->in6p_ip6_nxt && in6p->in6p_ip6_nxt != proto) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) goto docontinue; if (in6p->in6p_cksum != -1) { rip6stat.rip6s_isum++; if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { rip6stat.rip6s_badsum++; goto docontinue; } } if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject(n, last)) { m_freem(n); ipsec6stat.in_polvio++; /* do not inject data into pcb */ } else #endif /* IPSEC */ if (n) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } - INP_UNLOCK(last); + INP_WUNLOCK(last); } last = in6p; } #ifdef IPSEC /* * Check AH/ESP integrity. */ if (last && ipsec6_in_reject(m, last)) { m_freem(m); ipsec6stat.in_polvio++; ip6stat.ip6s_delivered--; /* do not inject data into pcb */ - INP_UNLOCK(last); + INP_WUNLOCK(last); } else #endif /* IPSEC */ if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); - INP_UNLOCK(last); + INP_WUNLOCK(last); } else { rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } ip6stat.ip6s_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); return IPPROTO_DONE; } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. 
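rip6_input() above hands a copy of the packet to every matching raw socket but avoids one m_copy() by remembering the previous match in 'last' and giving the original mbuf to the final matcher; udp6_input() later in this commit uses the same pattern for multicast delivery. A simplified sketch of the idea; deliver(), fanout() and the heap-allocated string standing in for the mbuf are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for appending an mbuf chain to a socket's receive buffer. */
static void
deliver(int sock, char *pkt)
{
    printf("socket %d gets %s\n", sock, pkt);
    free(pkt);
}

/*
 * Give each matching socket its own copy, except the last one, which
 * receives the original buffer: the copy is made only once a second
 * matcher has been found, mirroring the "last" bookkeeping in rip6_input().
 */
static void
fanout(const int *socks, int n, char *pkt)
{
    int last = -1;

    for (int i = 0; i < n; i++) {
        if (last != -1)
            deliver(last, strdup(pkt));   /* earlier match gets a copy */
        last = socks[i];
    }
    if (last != -1)
        deliver(last, pkt);               /* final match gets the original */
    else {
        printf("no matching socket, drop\n");
        free(pkt);
    }
}

int
main(void)
{
    int socks[] = { 3, 4, 7 };

    fanout(socks, 3, strdup("packet"));
    return (0);
}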
*/ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. * Tack on options user may have setup with control call. */ int #if __STDC__ rip6_output(struct mbuf *m, ...) #else rip6_output(m, va_alist) struct mbuf *m; va_dcl #endif { struct mbuf *control; struct socket *so; struct sockaddr_in6 *dstsock; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; struct in6_addr *in6a; va_list ap; va_start(ap, m); so = va_arg(ap, struct socket *); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoin6pcb(so); - INP_LOCK(in6p); + INP_WLOCK(in6p); dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code * to update statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ if ((in6a = in6_selectsrc(dstsock, optp, in6p->in6p_moptions, NULL, &in6p->in6p_laddr, &oifp, &error)) == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto bad; } ip6->ip6_src = *in6a; if (oifp && scope_ambiguous) { /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface. determine the zone ID based on the interface. */ error = in6_setscope(&dstsock->sin6_addr, oifp, NULL); if (error != 0) goto bad; } ip6->ip6_dst = dstsock->sin6_addr; /* fill in the rest of the IPv6 header fields */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* ip6_plen will be filled in ip6_output, so not fill it here. 
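rip6_output() above (and udp6_output() further down) notes scope_ambiguous when the caller supplied no sin6_scope_id and default zones are disabled, then fills the zone in from the outgoing interface once that interface is known. A small sketch of that deferred disambiguation; struct demo_dst, pick_interface() and resolve_zone() are illustrative placeholders.

#include <stdbool.h>
#include <stdio.h>

struct demo_dst {
    bool link_local;     /* does the address need a zone at all? */
    int  scope_id;       /* 0 == left unspecified by the application */
};

/* Stand-in for route/interface selection; returns an interface index. */
static int
pick_interface(const struct demo_dst *dst)
{
    (void)dst;
    return (2);
}

/*
 * If the application left the zone ambiguous, remember that fact and fix
 * the zone up from the chosen outgoing interface afterwards, mirroring the
 * scope_ambiguous / in6_setscope() sequence above.
 */
static int
resolve_zone(struct demo_dst *dst, bool use_defzone)
{
    bool ambiguous = (dst->scope_id == 0 && !use_defzone);
    int ifindex = pick_interface(dst);

    if (ambiguous && dst->link_local)
        dst->scope_id = ifindex;    /* zone derived from the interface */
    return (dst->scope_id);
}

int
main(void)
{
    struct demo_dst dst = { true, 0 };

    printf("zone = %d\n", resolve_zone(&dst, false));
    return (0);
}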
*/ ip6->ip6_nxt = in6p->in6p_ip6_nxt; ip6->ip6_hlim = in6_selecthlim(in6p, oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* compute checksum */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); icmp6stat.icp6s_outhist[type]++; } else rip6stat.rip6s_opackets++; goto freectl; bad: if (m) m_freem(m); freectl: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } - INP_UNLOCK(in6p); + INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return error; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; MALLOC(filter, struct icmp6_filter *, sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return ENOMEM; INP_INFO_WLOCK(&ripcbinfo); error = in_pcballoc(so, &ripcbinfo); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); FREE(filter, M_PCB); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static void rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->in6p_icmp6filt) { FREE(inp->in6p_icmp6filt, M_PCB); inp->in6p_icmp6filt = NULL; } in6_pcbdetach(inp); in6_pcbfree(inp); INP_INFO_WUNLOCK(&ripcbinfo); } /* XXXRW: This can't ever be called. 
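The checksum path in rip6_output() above translates the logical offset of the checksum field into a position inside the mbuf chain by walking buffers until the offset falls within one of them, then stores the result of in6_cksum() there. A user-space sketch of that chain walk; struct chunk and chain_at() are stand-ins for the mbuf chain.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct chunk {
    struct chunk *next;
    uint8_t      *data;
    size_t        len;
};

/*
 * Translate a byte offset into the chain into a pointer within the chunk
 * that holds it, mirroring the "while (n && n->m_len <= off)" walk above.
 * Returns NULL if the chain is shorter than off.
 */
static uint8_t *
chain_at(struct chunk *c, size_t off)
{
    while (c != NULL && c->len <= off) {
        off -= c->len;
        c = c->next;
    }
    return (c != NULL ? c->data + off : NULL);
}

int
main(void)
{
    uint8_t a[8] = { 0 }, b[8] = { 0 };
    struct chunk c2 = { NULL, b, sizeof(b) };
    struct chunk c1 = { &c2, a, sizeof(a) };
    uint8_t *p = chain_at(&c1, 10);      /* lands at b[2] */

    if (p != NULL) {
        *p = 0xff;                       /* e.g. store a checksum byte here */
        printf("b[2] = 0x%02x\n", b[2]);
    }
    return (0);
}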
*/ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; int error = 0; KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) return(error); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return EADDRNOTAVAIL; if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return (EADDRNOTAVAIL); } INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; struct ifnet *ifp = NULL; int error = 0, scope_ambiguous = 0; KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (addr->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) return(error); INP_INFO_WLOCK(&ripcbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp->in6p_moptions, NULL, &inp->in6p_laddr, &ifp, &error); if (in6a == NULL) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return (error ? 
error : EADDRNOTAVAIL); } /* XXX: see above */ if (ifp && scope_ambiguous && (error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) { - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return(error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = *in6a; soisconnected(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); - INP_LOCK(inp); + INP_WLOCK(inp); socantsendmore(so); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return 0; } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; KASSERT(inp != NULL, ("rip6_send: inp == NULL")); INP_INFO_WLOCK(&ripcbinfo); /* always copy sockaddr to avoid overwrites */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return EISCONN; } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); dst = &tmp; } else { if (nam == NULL) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return ENOTCONN; } if (nam->sa_len != sizeof(struct sockaddr_in6)) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return(EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); INP_INFO_WUNLOCK(&ripcbinfo); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c (revision 178284) +++ head/sys/netinet6/udp6_usrreq.c (revision 178285) @@ -1,1019 +1,1019 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
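rip6_send() above always copies the caller's sockaddr into a local buffer, uses the connected peer address when the socket is connected (rejecting an explicit destination with EISCONN), and quietly treats AF_UNSPEC as AF_INET6 for old applications. A condensed sketch of that destination choice; struct demo_sa, pick_dst() and DEMO_AF_INET6 are illustrative, while the errno values follow the code above.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_sa {
    int  family;          /* 0 plays the role of AF_UNSPEC */
    char addr[16];
};

#define DEMO_AF_INET6 28

/*
 * Decide which destination a raw send should use, mirroring rip6_send():
 * connected sockets must not pass a destination, unconnected ones must,
 * and a zero family is quietly treated as AF_INET6 for old applications.
 */
static int
pick_dst(bool connected, const struct demo_sa *peer, const struct demo_sa *nam,
    struct demo_sa *dst)
{
    if (connected) {
        if (nam != NULL)
            return (EISCONN);
        *dst = *peer;                  /* copy, never point into the PCB */
        return (0);
    }
    if (nam == NULL)
        return (ENOTCONN);
    *dst = *nam;
    if (dst->family == 0)
        dst->family = DEMO_AF_INET6;   /* backward-compat quirk */
    else if (dst->family != DEMO_AF_INET6)
        return (EAFNOSUPPORT);
    return (0);
}

int
main(void)
{
    struct demo_sa peer = { DEMO_AF_INET6, "2001:db8::2" }, dst;

    printf("%d\n", pick_dst(true, &peer, NULL, &dst));   /* 0 */
    printf("%d\n", pick_dst(true, &peer, &peer, &dst));  /* EISCONN */
    return (0);
}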
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ extern struct protosw inetsw[]; static void udp6_detach(struct socket *so); static void udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. 
*/ if (ipsec6_in_reject(n, inp)) { m_freem(n); ipsec6stat.in_polvio++; return; } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif opts = NULL; if (inp->in6p_flags & IN6P_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; } else sorwakeup_locked(so); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; int off = *offp; int plen, ulen; struct sockaddr_in6 fromsa; ip6 = mtod(m, struct ip6_hdr *); if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX send icmp6 host/port unreach? */ m_freem(m); return (IPPROTO_DONE); } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); if (!uh) return (IPPROTO_DONE); #endif udpstat.udps_ipackets++; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); if (plen != ulen) { udpstat.udps_badlen++; goto badunlocked; } /* * Checksum extended UDP header and data. */ if (uh->uh_sum == 0) { udpstat.udps_nosum++; goto badunlocked; } if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { udpstat.udps_badsum++; goto badunlocked; } /* * Construct sockaddr format source address. */ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; INP_INFO_RLOCK(&udbinfo); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ last = NULL; LIST_FOREACH(inp, &udb, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->in6p_lport != uh->uh_dport) continue; /* * XXX: Do not check source port of incoming datagram * unless inp_connect() has been called to bind the * fport part of the 4-tuple; the source could be * trying to talk to us with an ephemeral port. */ if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) || inp->in6p_fport != uh->uh_sport) continue; } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { - INP_LOCK(last); + INP_WLOCK(last); udp6_append(last, n, off, &fromsa); - INP_UNLOCK(last); + INP_WUNLOCK(last); } } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. 
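udp6_input() above drops a datagram whose destination port is zero, whose IPv6 payload length disagrees with the UDP length field, or whose checksum is absent (a zero checksum is not allowed over IPv6) or fails verification. A compact sketch of those early checks; struct demo_udp6 and the verify_cksum() stub are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_udp6 {
    uint16_t dport;
    uint16_t ulen;       /* UDP length field */
    uint16_t sum;        /* UDP checksum */
};

/* Stand-in for in6_cksum() over the pseudo-header and payload. */
static bool
verify_cksum(const struct demo_udp6 *uh)
{
    (void)uh;
    return (true);
}

/*
 * Validate a UDP-over-IPv6 header before demultiplexing, mirroring the
 * early checks in udp6_input(): bad port, bad length, absent or bad sum.
 */
static bool
udp6_sane(const struct demo_udp6 *uh, uint16_t payload_len)
{
    if (uh->dport == 0)
        return (false);              /* RFC 768: port 0 is illegal */
    if (uh->ulen != payload_len)
        return (false);              /* length fields disagree */
    if (uh->sum == 0)
        return (false);              /* checksum is mandatory over IPv6 */
    return (verify_cksum(uh));
}

int
main(void)
{
    struct demo_udp6 uh = { 53, 32, 0xbeef };

    printf("%s\n", udp6_sane(&uh, 32) ? "accept" : "drop");
    return (0);
}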
This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ udpstat.udps_noport++; udpstat.udps_noportmcast++; goto badheadlocked; } - INP_LOCK(last); + INP_WLOCK(last); udp6_append(last, m, off, &fromsa); - INP_UNLOCK(last); + INP_WUNLOCK(last); INP_INFO_RUNLOCK(&udbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ inp = in6_pcblookup_hash(&udbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); udpstat.udps_noportmcast++; goto badheadlocked; } INP_INFO_RUNLOCK(&udbinfo); if (udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } - INP_LOCK(inp); + INP_WLOCK(inp); udp6_append(inp, m, off, &fromsa); - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); return (IPPROTO_DONE); badheadlocked: INP_INFO_RUNLOCK(&udbinfo); badunlocked: if (m) m_freem(m); return (IPPROTO_DONE); } void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } if (ip6) { /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* Check if we can safely examine src and dst ports. 
*/ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); (void) in6_pcbnotify(&udbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } else (void) in6_pcbnotify(&udbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { return (error); } INP_INFO_RLOCK(&udbinfo); inp = in6_pcblookup_hash(&udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 1, NULL); if (inp == NULL) { INP_INFO_RUNLOCK(&udbinfo); return (ENOENT); } - INP_LOCK(inp); + INP_WLOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr; struct sockaddr_in6 *sin6 = NULL; struct ifnet *oifp = NULL; int scope_ambiguous = 0; u_short fport; int error = 0; struct ip6_pktopts *optp, opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (addr6) { /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return (error); } if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, IPPROTO_UDP)) != 0) goto release; optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { faddr = &sin6->sin6_addr; /* * IPv4 version of udp_output calls in_pcbconnect in this case, * which needs splnet and affects performance. * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? 
*/ error = EISCONN; goto release; } fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { laddr = in6_selectsrc(sin6, optp, inp->in6p_moptions, NULL, &inp->in6p_laddr, &oifp, &error); if (oifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, oifp, NULL))) { goto release; } } else laddr = &inp->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->in6p_lport == 0 && (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) goto release; } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { if ((inp->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->in6p_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->in6p_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->in6p_flowinfo & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; #if 0 /* ip6_plen will be filled in ip6_output. 
*/ ip6->ip6_plen = htons((u_short)plen); #endif ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if ((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr), plen)) == 0) { udp6->uh_sum = 0xffff; } flags = 0; udpstat.udps_opackets++; error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions, NULL, inp); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; (*pru->pru_abort)(so); return; } #endif INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; soisdisconnected(so); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } INP_INFO_WLOCK(&udbinfo); error = in_pcballoc(so, &udbinfo); if (error) { INP_INFO_WUNLOCK(&udbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. 
*/ inp->inp_ip_ttl = ip_defttl; - INP_UNLOCK(inp); + INP_WUNLOCK(inp); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } } error = in6_pcbbind(inp, nam, td->td_ucred); out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; (*pru->pru_disconnect)(so); return; } #endif INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; soisdisconnected(so); } - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6_p); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); if (error == 0) { inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; soisconnected(so); } goto out; } } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } error = in6_pcbconnect(inp, nam, td->td_ucred); if (error == 0) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { /* should be non mapped addr */ inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; } soisconnected(so); } out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); in6_pcbdetach(inp); in6_pcbfree(inp); INP_INFO_WUNLOCK(&udbinfo); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; error = (*pru->pru_disconnect)(so); goto out; } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto out; } in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; /* XXXRW: so_state locking? 
*/ so->so_state &= ~SS_ISCONNECTED; /* XXX */ out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (0); } static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_INFO_WLOCK(&udbinfo); - INP_LOCK(inp); + INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = 0; if (addr == 0) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * When remote addr is IPv4-mapped address, * local addr should not be an IPv6 address; * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto out; } if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; error = ((*pru->pru_send)(so, flags, m, addr, control, td)); /* addr will just be freed in sendit(). */ goto out; } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = udp6_output(inp, m, addr, control, td); out: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return (error); bad: - INP_UNLOCK(inp); + INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return (error); } struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, .pru_bind = udp6_bind, .pru_connect = udp6_connect, .pru_control = in6_control, .pru_detach = udp6_detach, .pru_disconnect = udp6_disconnect, .pru_peeraddr = in6_mapped_peeraddr, .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close }; Index: head/sys/security/audit/audit_arg.c =================================================================== --- head/sys/security/audit/audit_arg.c (revision 178284) +++ head/sys/security/audit/audit_arg.c (revision 178285) @@ -1,857 +1,857 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Calls to manipulate elements of the audit record structure from system * call code. Macro wrappers will prevent this functions from being entered * if auditing is disabled, avoiding the function call cost. We check the * thread audit record pointer anyway, as the audit condition could change, * and pre-selection may not have allocated an audit record for this event. * * XXXAUDIT: Should we assert, in each case, that this field of the record * hasn't already been filled in? */ void audit_arg_addr(void *addr) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_addr = addr; ARG_SET_VALID(ar, ARG_ADDR); } void audit_arg_exit(int status, int retval) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_exitstatus = status; ar->k_ar.ar_arg_exitretval = retval; ARG_SET_VALID(ar, ARG_EXIT); } void audit_arg_len(int len) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_len = len; ARG_SET_VALID(ar, ARG_LEN); } void audit_arg_fd(int fd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_fd = fd; ARG_SET_VALID(ar, ARG_FD); } void audit_arg_fflags(int fflags) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_fflags = fflags; ARG_SET_VALID(ar, ARG_FFLAGS); } void audit_arg_gid(gid_t gid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_gid = gid; ARG_SET_VALID(ar, ARG_GID); } void audit_arg_uid(uid_t uid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_uid = uid; ARG_SET_VALID(ar, ARG_UID); } void audit_arg_egid(gid_t egid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_egid = egid; ARG_SET_VALID(ar, ARG_EGID); } void audit_arg_euid(uid_t euid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_euid = euid; ARG_SET_VALID(ar, ARG_EUID); } void audit_arg_rgid(gid_t rgid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_rgid = rgid; ARG_SET_VALID(ar, ARG_RGID); } void audit_arg_ruid(uid_t ruid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_ruid = ruid; ARG_SET_VALID(ar, ARG_RUID); } void audit_arg_sgid(gid_t sgid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_sgid = sgid; ARG_SET_VALID(ar, ARG_SGID); } void audit_arg_suid(uid_t suid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_suid = suid; ARG_SET_VALID(ar, ARG_SUID); } void audit_arg_groupset(gid_t *gidset, u_int gidset_size) { int i; struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; for (i = 0; i < gidset_size; i++) ar->k_ar.ar_arg_groups.gidset[i] = gidset[i]; 
ar->k_ar.ar_arg_groups.gidset_size = gidset_size; ARG_SET_VALID(ar, ARG_GROUPSET); } void audit_arg_login(char *login) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; strlcpy(ar->k_ar.ar_arg_login, login, MAXLOGNAME); ARG_SET_VALID(ar, ARG_LOGIN); } void audit_arg_ctlname(int *name, int namelen) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy(name, &ar->k_ar.ar_arg_ctlname, namelen * sizeof(int)); ar->k_ar.ar_arg_len = namelen; ARG_SET_VALID(ar, ARG_CTLNAME | ARG_LEN); } void audit_arg_mask(int mask) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_mask = mask; ARG_SET_VALID(ar, ARG_MASK); } void audit_arg_mode(mode_t mode) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_mode = mode; ARG_SET_VALID(ar, ARG_MODE); } void audit_arg_dev(int dev) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_dev = dev; ARG_SET_VALID(ar, ARG_DEV); } void audit_arg_value(long value) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_value = value; ARG_SET_VALID(ar, ARG_VALUE); } void audit_arg_owner(uid_t uid, gid_t gid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_uid = uid; ar->k_ar.ar_arg_gid = gid; ARG_SET_VALID(ar, ARG_UID | ARG_GID); } void audit_arg_pid(pid_t pid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_pid = pid; ARG_SET_VALID(ar, ARG_PID); } void audit_arg_process(struct proc *p) { struct kaudit_record *ar; KASSERT(p != NULL, ("audit_arg_process: p == NULL")); PROC_LOCK_ASSERT(p, MA_OWNED); ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = p->p_ucred->cr_audit.ai_auid; ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid; ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0]; ar->k_ar.ar_arg_ruid = p->p_ucred->cr_ruid; ar->k_ar.ar_arg_rgid = p->p_ucred->cr_rgid; ar->k_ar.ar_arg_asid = p->p_ucred->cr_audit.ai_asid; ar->k_ar.ar_arg_termid_addr = p->p_ucred->cr_audit.ai_termid; ar->k_ar.ar_arg_pid = p->p_pid; ARG_SET_VALID(ar, ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID | ARG_RGID | ARG_ASID | ARG_TERMID_ADDR | ARG_PID | ARG_PROCESS); } void audit_arg_signum(u_int signum) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_signum = signum; ARG_SET_VALID(ar, ARG_SIGNUM); } void audit_arg_socket(int sodomain, int sotype, int soprotocol) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_sockinfo.so_domain = sodomain; ar->k_ar.ar_arg_sockinfo.so_type = sotype; ar->k_ar.ar_arg_sockinfo.so_protocol = soprotocol; ARG_SET_VALID(ar, ARG_SOCKINFO); } void audit_arg_sockaddr(struct thread *td, struct sockaddr *sa) { struct kaudit_record *ar; KASSERT(td != NULL, ("audit_arg_sockaddr: td == NULL")); KASSERT(sa != NULL, ("audit_arg_sockaddr: sa == NULL")); ar = currecord(); if (ar == NULL) return; bcopy(sa, &ar->k_ar.ar_arg_sockaddr, sa->sa_len); switch (sa->sa_family) { case AF_INET: ARG_SET_VALID(ar, ARG_SADDRINET); break; case AF_INET6: ARG_SET_VALID(ar, ARG_SADDRINET6); break; case AF_UNIX: audit_arg_upath(td, ((struct sockaddr_un *)sa)->sun_path, ARG_UPATH1); ARG_SET_VALID(ar, ARG_SADDRUNIX); break; /* XXXAUDIT: default:? 
*/ } } void audit_arg_auid(uid_t auid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = auid; ARG_SET_VALID(ar, ARG_AUID); } void audit_arg_auditinfo(struct auditinfo *au_info) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = au_info->ai_auid; ar->k_ar.ar_arg_asid = au_info->ai_asid; ar->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success; ar->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure; ar->k_ar.ar_arg_termid.port = au_info->ai_termid.port; ar->k_ar.ar_arg_termid.machine = au_info->ai_termid.machine; ARG_SET_VALID(ar, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID); } void audit_arg_auditinfo_addr(struct auditinfo_addr *au_info) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = au_info->ai_auid; ar->k_ar.ar_arg_asid = au_info->ai_asid; ar->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success; ar->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure; ar->k_ar.ar_arg_termid_addr.at_type = au_info->ai_termid.at_type; ar->k_ar.ar_arg_termid_addr.at_port = au_info->ai_termid.at_port; ar->k_ar.ar_arg_termid_addr.at_addr[0] = au_info->ai_termid.at_addr[0]; ar->k_ar.ar_arg_termid_addr.at_addr[1] = au_info->ai_termid.at_addr[1]; ar->k_ar.ar_arg_termid_addr.at_addr[2] = au_info->ai_termid.at_addr[2]; ar->k_ar.ar_arg_termid_addr.at_addr[3] = au_info->ai_termid.at_addr[3]; ARG_SET_VALID(ar, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID_ADDR); } void audit_arg_text(char *text) { struct kaudit_record *ar; KASSERT(text != NULL, ("audit_arg_text: text == NULL")); ar = currecord(); if (ar == NULL) return; /* Invalidate the text string */ ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_TEXT); if (ar->k_ar.ar_arg_text == NULL) ar->k_ar.ar_arg_text = malloc(MAXPATHLEN, M_AUDITTEXT, M_WAITOK); strncpy(ar->k_ar.ar_arg_text, text, MAXPATHLEN); ARG_SET_VALID(ar, ARG_TEXT); } void audit_arg_cmd(int cmd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_cmd = cmd; ARG_SET_VALID(ar, ARG_CMD); } void audit_arg_svipc_cmd(int cmd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_cmd = cmd; ARG_SET_VALID(ar, ARG_SVIPC_CMD); } void audit_arg_svipc_perm(struct ipc_perm *perm) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy(perm, &ar->k_ar.ar_arg_svipc_perm, sizeof(ar->k_ar.ar_arg_svipc_perm)); ARG_SET_VALID(ar, ARG_SVIPC_PERM); } void audit_arg_svipc_id(int id) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_id = id; ARG_SET_VALID(ar, ARG_SVIPC_ID); } void audit_arg_svipc_addr(void * addr) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_addr = addr; ARG_SET_VALID(ar, ARG_SVIPC_ADDR); } void audit_arg_posix_ipc_perm(uid_t uid, gid_t gid, mode_t mode) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_pipc_perm.pipc_uid = uid; ar->k_ar.ar_arg_pipc_perm.pipc_gid = gid; ar->k_ar.ar_arg_pipc_perm.pipc_mode = mode; ARG_SET_VALID(ar, ARG_POSIX_IPC_PERM); } void audit_arg_auditon(union auditon_udata *udata) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy((void *)udata, &ar->k_ar.ar_arg_auditon, sizeof(ar->k_ar.ar_arg_auditon)); ARG_SET_VALID(ar, ARG_AUDITON); } /* * Audit information about a file, either the file's vnode info, or its * socket address info. 
*/ void audit_arg_file(struct proc *p, struct file *fp) { struct kaudit_record *ar; struct socket *so; struct inpcb *pcb; struct vnode *vp; int vfslocked; ar = currecord(); if (ar == NULL) return; switch (fp->f_type) { case DTYPE_VNODE: case DTYPE_FIFO: /* * XXXAUDIT: Only possibly to record as first vnode? */ vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); audit_arg_vnode(vp, ARG_VNODE1); VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); break; case DTYPE_SOCKET: so = (struct socket *)fp->f_data; if (INP_CHECK_SOCKAF(so, PF_INET)) { SOCK_LOCK(so); ar->k_ar.ar_arg_sockinfo.so_type = so->so_type; ar->k_ar.ar_arg_sockinfo.so_domain = INP_SOCKAF(so); ar->k_ar.ar_arg_sockinfo.so_protocol = so->so_proto->pr_protocol; SOCK_UNLOCK(so); pcb = (struct inpcb *)so->so_pcb; - INP_LOCK(pcb); + INP_WLOCK(pcb); ar->k_ar.ar_arg_sockinfo.so_raddr = pcb->inp_faddr.s_addr; ar->k_ar.ar_arg_sockinfo.so_laddr = pcb->inp_laddr.s_addr; ar->k_ar.ar_arg_sockinfo.so_rport = pcb->inp_fport; ar->k_ar.ar_arg_sockinfo.so_lport = pcb->inp_lport; - INP_UNLOCK(pcb); + INP_WUNLOCK(pcb); ARG_SET_VALID(ar, ARG_SOCKINFO); } break; default: /* XXXAUDIT: else? */ break; } } /* * Store a path as given by the user process for auditing into the audit * record stored on the user thread. This function will allocate the memory * to store the path info if not already available. This memory will be freed * when the audit record is freed. * * XXXAUDIT: Possibly assert that the memory isn't already allocated? */ void audit_arg_upath(struct thread *td, char *upath, u_int64_t flag) { struct kaudit_record *ar; char **pathp; KASSERT(td != NULL, ("audit_arg_upath: td == NULL")); KASSERT(upath != NULL, ("audit_arg_upath: upath == NULL")); ar = currecord(); if (ar == NULL) return; KASSERT((flag == ARG_UPATH1) || (flag == ARG_UPATH2), ("audit_arg_upath: flag %llu", (unsigned long long)flag)); KASSERT((flag != ARG_UPATH1) || (flag != ARG_UPATH2), ("audit_arg_upath: flag %llu", (unsigned long long)flag)); if (flag == ARG_UPATH1) pathp = &ar->k_ar.ar_arg_upath1; else pathp = &ar->k_ar.ar_arg_upath2; if (*pathp == NULL) *pathp = malloc(MAXPATHLEN, M_AUDITPATH, M_WAITOK); audit_canon_path(td, upath, *pathp); ARG_SET_VALID(ar, flag); } /* * Function to save the path and vnode attr information into the audit * record. * * It is assumed that the caller will hold any vnode locks necessary to * perform a VOP_GETATTR() on the passed vnode. * * XXX: The attr code is very similar to vfs_vnops.c:vn_stat(), but always * provides access to the generation number as we need that to construct the * BSM file ID. * * XXX: We should accept the process argument from the caller, since it's * very likely they already have a reference. * * XXX: Error handling in this function is poor. * * XXXAUDIT: Possibly KASSERT the path pointer is NULL? */ void audit_arg_vnode(struct vnode *vp, u_int64_t flags) { struct kaudit_record *ar; struct vattr vattr; int error; struct vnode_au_info *vnp; KASSERT(vp != NULL, ("audit_arg_vnode: vp == NULL")); KASSERT((flags == ARG_VNODE1) || (flags == ARG_VNODE2), ("audit_arg_vnode: flags %jd", (intmax_t)flags)); /* * Assume that if the caller is calling audit_arg_vnode() on a * non-MPSAFE vnode, then it will have acquired Giant. */ VFS_ASSERT_GIANT(vp->v_mount); ASSERT_VOP_LOCKED(vp, "audit_arg_vnode"); ar = currecord(); if (ar == NULL) return; /* * XXXAUDIT: The below clears, and then resets the flags for valid * arguments. Ideally, either the new vnode is used, or the old one * would be. 
*/ if (flags & ARG_VNODE1) { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1); vnp = &ar->k_ar.ar_arg_vnode1; } else { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2); vnp = &ar->k_ar.ar_arg_vnode2; } error = VOP_GETATTR(vp, &vattr, curthread->td_ucred, curthread); if (error) { /* XXX: How to handle this case? */ return; } vnp->vn_mode = vattr.va_mode; vnp->vn_uid = vattr.va_uid; vnp->vn_gid = vattr.va_gid; vnp->vn_dev = vattr.va_rdev; vnp->vn_fsid = vattr.va_fsid; vnp->vn_fileid = vattr.va_fileid; vnp->vn_gen = vattr.va_gen; if (flags & ARG_VNODE1) ARG_SET_VALID(ar, ARG_VNODE1); else ARG_SET_VALID(ar, ARG_VNODE2); } /* * Audit the argument strings passed to exec. */ void audit_arg_argv(char *argv, int argc, int length) { struct kaudit_record *ar; if (audit_argv == 0) return; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_argv = malloc(length, M_AUDITTEXT, M_WAITOK); bcopy(argv, ar->k_ar.ar_arg_argv, length); ar->k_ar.ar_arg_argc = argc; ARG_SET_VALID(ar, ARG_ARGV); } /* * Audit the environment strings passed to exec. */ void audit_arg_envv(char *envv, int envc, int length) { struct kaudit_record *ar; if (audit_arge == 0) return; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_envv = malloc(length, M_AUDITTEXT, M_WAITOK); bcopy(envv, ar->k_ar.ar_arg_envv, length); ar->k_ar.ar_arg_envc = envc; ARG_SET_VALID(ar, ARG_ENVV); } /* * The close() system call uses it's own audit call to capture the path/vnode * information because those pieces are not easily obtained within the system * call itself. */ void audit_sysclose(struct thread *td, int fd) { struct kaudit_record *ar; struct vnode *vp; struct file *fp; int vfslocked; KASSERT(td != NULL, ("audit_sysclose: td == NULL")); ar = currecord(); if (ar == NULL) return; audit_arg_fd(fd); if (getvnode(td->td_proc->p_fd, fd, &fp) != 0) return; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); audit_arg_vnode(vp, ARG_VNODE1); VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); } Index: head/sys/security/mac/mac_inet.c =================================================================== --- head/sys/security/mac/mac_inet.c (revision 178284) +++ head/sys/security/mac/mac_inet.c (revision 178285) @@ -1,391 +1,391 @@ /*- * Copyright (c) 1999-2002, 2007 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2004 Networks Associates Technology, Inc. * Copyright (c) 2006 SPARTA, Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct label * mac_inpcb_label_alloc(int flag) { struct label *label; int error; label = mac_labelzone_alloc(flag); if (label == NULL) return (NULL); MAC_CHECK(inpcb_init_label, label, flag); if (error) { MAC_PERFORM(inpcb_destroy_label, label); mac_labelzone_free(label); return (NULL); } return (label); } int mac_inpcb_init(struct inpcb *inp, int flag) { inp->inp_label = mac_inpcb_label_alloc(flag); if (inp->inp_label == NULL) return (ENOMEM); return (0); } static struct label * mac_ipq_label_alloc(int flag) { struct label *label; int error; label = mac_labelzone_alloc(flag); if (label == NULL) return (NULL); MAC_CHECK(ipq_init_label, label, flag); if (error) { MAC_PERFORM(ipq_destroy_label, label); mac_labelzone_free(label); return (NULL); } return (label); } int mac_ipq_init(struct ipq *ipq, int flag) { ipq->ipq_label = mac_ipq_label_alloc(flag); if (ipq->ipq_label == NULL) return (ENOMEM); return (0); } static void mac_inpcb_label_free(struct label *label) { MAC_PERFORM(inpcb_destroy_label, label); mac_labelzone_free(label); } void mac_inpcb_destroy(struct inpcb *inp) { mac_inpcb_label_free(inp->inp_label); inp->inp_label = NULL; } static void mac_ipq_label_free(struct label *label) { MAC_PERFORM(ipq_destroy_label, label); mac_labelzone_free(label); } void mac_ipq_destroy(struct ipq *ipq) { mac_ipq_label_free(ipq->ipq_label); ipq->ipq_label = NULL; } void mac_inpcb_create(struct socket *so, struct inpcb *inp) { MAC_PERFORM(inpcb_create, so, so->so_label, inp, inp->inp_label); } void mac_ipq_reassemble(struct ipq *ipq, struct mbuf *m) { struct label *label; label = mac_mbuf_to_label(m); MAC_PERFORM(ipq_reassemble, ipq, ipq->ipq_label, m, label); } void mac_netinet_fragment(struct mbuf *m, struct mbuf *frag) { struct label *mlabel, *fraglabel; mlabel = mac_mbuf_to_label(m); fraglabel = mac_mbuf_to_label(frag); MAC_PERFORM(netinet_fragment, m, mlabel, frag, fraglabel); } void mac_ipq_create(struct mbuf *m, struct ipq *ipq) { struct label *label; label = mac_mbuf_to_label(m); MAC_PERFORM(ipq_create, m, label, ipq, ipq->ipq_label); } void mac_inpcb_create_mbuf(struct inpcb *inp, struct mbuf *m) { struct label *mlabel; - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); mlabel = mac_mbuf_to_label(m); MAC_PERFORM(inpcb_create_mbuf, inp, inp->inp_label, m, mlabel); } int mac_ipq_match(struct mbuf *m, struct ipq *ipq) { struct label *label; int result; label = mac_mbuf_to_label(m); result = 1; MAC_BOOLEAN(ipq_match, &&, m, label, ipq, ipq->ipq_label); return (result); } void 
mac_netinet_arp_send(struct ifnet *ifp, struct mbuf *m) { struct label *mlabel; mlabel = mac_mbuf_to_label(m); MAC_IFNET_LOCK(ifp); MAC_PERFORM(netinet_arp_send, ifp, ifp->if_label, m, mlabel); MAC_IFNET_UNLOCK(ifp); } void mac_netinet_icmp_reply(struct mbuf *mrecv, struct mbuf *msend) { struct label *mrecvlabel, *msendlabel; mrecvlabel = mac_mbuf_to_label(mrecv); msendlabel = mac_mbuf_to_label(msend); MAC_PERFORM(netinet_icmp_reply, mrecv, mrecvlabel, msend, msendlabel); } void mac_netinet_icmp_replyinplace(struct mbuf *m) { struct label *label; label = mac_mbuf_to_label(m); MAC_PERFORM(netinet_icmp_replyinplace, m, label); } void mac_netinet_igmp_send(struct ifnet *ifp, struct mbuf *m) { struct label *mlabel; mlabel = mac_mbuf_to_label(m); MAC_IFNET_LOCK(ifp); MAC_PERFORM(netinet_igmp_send, ifp, ifp->if_label, m, mlabel); MAC_IFNET_UNLOCK(ifp); } void mac_netinet_tcp_reply(struct mbuf *m) { struct label *label; label = mac_mbuf_to_label(m); MAC_PERFORM(netinet_tcp_reply, m, label); } void mac_ipq_update(struct mbuf *m, struct ipq *ipq) { struct label *label; label = mac_mbuf_to_label(m); MAC_PERFORM(ipq_update, m, label, ipq, ipq->ipq_label); } int mac_inpcb_check_deliver(struct inpcb *inp, struct mbuf *m) { struct label *label; int error; M_ASSERTPKTHDR(m); label = mac_mbuf_to_label(m); MAC_CHECK(inpcb_check_deliver, inp, inp->inp_label, m, label); return (error); } void mac_inpcb_sosetlabel(struct socket *so, struct inpcb *inp) { - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); SOCK_LOCK_ASSERT(so); MAC_PERFORM(inpcb_sosetlabel, so, so->so_label, inp, inp->inp_label); } void mac_netinet_firewall_reply(struct mbuf *mrecv, struct mbuf *msend) { struct label *mrecvlabel, *msendlabel; M_ASSERTPKTHDR(mrecv); M_ASSERTPKTHDR(msend); mrecvlabel = mac_mbuf_to_label(mrecv); msendlabel = mac_mbuf_to_label(msend); MAC_PERFORM(netinet_firewall_reply, mrecv, mrecvlabel, msend, msendlabel); } void mac_netinet_firewall_send(struct mbuf *m) { struct label *label; M_ASSERTPKTHDR(m); label = mac_mbuf_to_label(m); MAC_PERFORM(netinet_firewall_send, m, label); } /* * These functions really should be referencing the syncache structure * instead of the label. However, due to some of the complexities associated * with exposing this syncache structure we operate directly on it's label * pointer. This should be OK since we aren't making any access control * decisions within this code directly, we are merely allocating and copying * label storage so we can properly initialize mbuf labels for any packets * the syncache code might create. */ void mac_syncache_destroy(struct label **label) { MAC_PERFORM(syncache_destroy_label, *label); mac_labelzone_free(*label); *label = NULL; } int mac_syncache_init(struct label **label) { int error; *label = mac_labelzone_alloc(M_NOWAIT); if (*label == NULL) return (ENOMEM); /* * Since we are holding the inpcb locks the policy can not allocate * policy specific label storage using M_WAITOK. So we need to do a * MAC_CHECK instead of the typical MAC_PERFORM so we can propagate * allocation failures back to the syncache code. */ MAC_CHECK(syncache_init_label, *label, M_NOWAIT); return (error); } void mac_syncache_create(struct label *label, struct inpcb *inp) { - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); MAC_PERFORM(syncache_create, label, inp); } void mac_syncache_create_mbuf(struct label *sc_label, struct mbuf *m) { struct label *mlabel; M_ASSERTPKTHDR(m); mlabel = mac_mbuf_to_label(m); MAC_PERFORM(syncache_create_mbuf, sc_label, m, mlabel); }
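
The INP_LOCK()/INP_UNLOCK() renames to INP_WLOCK()/INP_WUNLOCK() (and INP_LOCK_ASSERT() to INP_WLOCK_ASSERT()) throughout this revision reflect the switch of the per-inpcb lock to a read/write lock, with all existing callers converted to the exclusive (write) form. As a rough sketch only, the W-suffixed macros can be thought of as wrappers around the rwlock(9) primitives along the following lines; the real definitions live in sys/netinet/in_pcb.h, are not part of this diff, and the field name inp_lock is assumed here purely for illustration:

	/*
	 * Hedged sketch, not the committed definitions: the W-suffixed inpcb
	 * locking macros are assumed to wrap rwlock(9) around a per-inpcb
	 * read/write lock (field name inp_lock assumed for illustration).
	 */
	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/rwlock.h>

	#define	INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)	/* exclusive */
	#define	INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
	#define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)

Under this scheme the lock ordering visible in the diff is unchanged: udp6_input() takes the global udbinfo lock shared (INP_INFO_RLOCK) around the PCB lookup and then write-locks the matched inpcb only for the duration of udp6_append(), while paths that change connection state (udp6_bind(), udp6_connect(), udp6_disconnect(), udp6_detach()) take the global lock exclusively (INP_INFO_WLOCK) before write-locking the inpcb.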