diff --git a/sys/net/altq/altq_subr.c b/sys/net/altq/altq_subr.c index 14d9916011da..39f91c4daf63 100644 --- a/sys/net/altq/altq_subr.c +++ b/sys/net/altq/altq_subr.c @@ -1,1956 +1,1956 @@ /*- * Copyright (C) 1997-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ * $FreeBSD$ */ #include "opt_altq.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include /* machine dependent clock related includes */ #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include /* for pentium tsc */ #include /* for CPUID_TSC */ #include /* for cpu_feature */ #endif /* __amd64 || __i386__ */ /* * internal function prototypes */ static void tbr_timeout(void *); int (*altq_input)(struct mbuf *, int) = NULL; static struct mbuf *tbr_dequeue(struct ifaltq *, int); static int tbr_timer = 0; /* token bucket regulator timer */ #if !defined(__FreeBSD__) || (__FreeBSD_version < 600000) static struct callout tbr_callout = CALLOUT_INITIALIZER; #else static struct callout tbr_callout; #endif #ifdef ALTQ3_CLFIER_COMPAT static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); #ifdef INET6 static int extract_ports6(struct mbuf *, struct ip6_hdr *, struct flowinfo_in6 *); #endif static int apply_filter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static int apply_ppfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); #ifdef INET6 static int apply_filter6(u_int32_t, struct flow_filter6 *, struct flowinfo_in6 *); #endif static int apply_tosfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static u_long get_filt_handle(struct acc_classifier *, int); static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); static u_int32_t filt2fibmask(struct flow_filter *); static void ip4f_cache(struct ip *, struct flowinfo_in *); static int ip4f_lookup(struct ip *, struct flowinfo_in *); static int ip4f_init(void); static struct ip4_frag *ip4f_alloc(void); static void ip4f_free(struct ip4_frag *); 
#endif /* ALTQ3_CLFIER_COMPAT */ #ifdef ALTQ SYSCTL_NODE(_kern_features, OID_AUTO, altq, CTLFLAG_RD | CTLFLAG_CAPRD, 0, "ALTQ packet queuing"); #define ALTQ_FEATURE(name, desc) \ SYSCTL_INT_WITH_LABEL(_kern_features_altq, OID_AUTO, name, \ CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, \ desc, "feature") #ifdef ALTQ_CBQ ALTQ_FEATURE(cbq, "ALTQ Class Based Queuing discipline"); #endif #ifdef ALTQ_CODEL ALTQ_FEATURE(codel, "ALTQ Controlled Delay discipline"); #endif #ifdef ALTQ_RED ALTQ_FEATURE(red, "ALTQ Random Early Detection discipline"); #endif #ifdef ALTQ_RIO ALTQ_FEATURE(rio, "ALTQ Random Early Drop discipline"); #endif #ifdef ALTQ_HFSC ALTQ_FEATURE(hfsc, "ALTQ Hierarchical Packet Scheduler discipline"); #endif #ifdef ALTQ_PRIQ ALTQ_FEATURE(priq, "ATLQ Priority Queuing discipline"); #endif #ifdef ALTQ_FAIRQ ALTQ_FEATURE(fairq, "ALTQ Fair Queuing discipline"); #endif #endif /* * alternate queueing support routines */ /* look up the queue state by the interface name and the queueing type. */ void * altq_lookup(name, type) char *name; int type; { struct ifnet *ifp; if ((ifp = ifunit(name)) != NULL) { /* read if_snd unlocked */ if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) return (ifp->if_snd.altq_disc); } return NULL; } int altq_attach(ifq, type, discipline, enqueue, dequeue, request) struct ifaltq *ifq; int type; void *discipline; int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); struct mbuf *(*dequeue)(struct ifaltq *, int); int (*request)(struct ifaltq *, int, void *); { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } ifq->altq_type = type; ifq->altq_disc = discipline; ifq->altq_enqueue = enqueue; ifq->altq_dequeue = dequeue; ifq->altq_request = request; ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); IFQ_UNLOCK(ifq); return 0; } int altq_detach(ifq) struct ifaltq *ifq; { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return EBUSY; } if (!ALTQ_IS_ATTACHED(ifq)) { IFQ_UNLOCK(ifq); return (0); } ifq->altq_type = ALTQT_NONE; ifq->altq_disc = NULL; ifq->altq_enqueue = NULL; ifq->altq_dequeue = NULL; ifq->altq_request = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; IFQ_UNLOCK(ifq); return 0; } int altq_enable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } s = splnet(); IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->ifq_drv_maxlen = 0; /* disable bulk dequeue */ ifq->altq_flags |= ALTQF_ENABLED; splx(s); IFQ_UNLOCK(ifq); return 0; } int altq_disable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } s = splnet(); IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->altq_flags &= ~(ALTQF_ENABLED); splx(s); IFQ_UNLOCK(ifq); return 0; } #ifdef ALTQ_DEBUG void altq_assert(file, line, failedexpr) const char *file, *failedexpr; int line; { (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", failedexpr, file, line); panic("altq assertion"); /* NOTREACHED */ } #endif /* * internal representation of token bucket parameters * rate: (byte_per_unittime << TBR_SHIFT) / machclk_freq * (((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq * depth: byte << TBR_SHIFT * */ #define TBR_SHIFT 29 #define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) #define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) static struct mbuf * tbr_dequeue(ifq, op) struct ifaltq *ifq; int op; { struct tb_regulator 
*tbr; struct mbuf *m; int64_t interval; u_int64_t now; IFQ_LOCK_ASSERT(ifq); tbr = ifq->altq_tbr; if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { /* if this is a remove after poll, bypass tbr check */ } else { /* update token only when it is negative */ if (tbr->tbr_token <= 0) { now = read_machclk(); interval = now - tbr->tbr_last; if (interval >= tbr->tbr_filluptime) tbr->tbr_token = tbr->tbr_depth; else { tbr->tbr_token += interval * tbr->tbr_rate; if (tbr->tbr_token > tbr->tbr_depth) tbr->tbr_token = tbr->tbr_depth; } tbr->tbr_last = now; } /* if token is still negative, don't allow dequeue */ if (tbr->tbr_token <= 0) return (NULL); } if (ALTQ_IS_ENABLED(ifq)) m = (*ifq->altq_dequeue)(ifq, op); else { if (op == ALTDQ_POLL) _IF_POLL(ifq, m); else _IF_DEQUEUE(ifq, m); } if (m != NULL && op == ALTDQ_REMOVE) tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); tbr->tbr_lastop = op; return (m); } /* * set a token bucket regulator. * if the specified rate is zero, the token bucket regulator is deleted. */ int tbr_set(ifq, profile) struct ifaltq *ifq; struct tb_profile *profile; { struct tb_regulator *tbr, *otbr; if (tbr_dequeue_ptr == NULL) tbr_dequeue_ptr = tbr_dequeue; if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) { printf("tbr_set: no cpu clock available!\n"); return (ENXIO); } IFQ_LOCK(ifq); if (profile->rate == 0) { /* delete this tbr */ if ((tbr = ifq->altq_tbr) == NULL) { IFQ_UNLOCK(ifq); return (ENOENT); } ifq->altq_tbr = NULL; free(tbr, M_DEVBUF); IFQ_UNLOCK(ifq); return (0); } tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO); if (tbr == NULL) { IFQ_UNLOCK(ifq); return (ENOMEM); } tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; tbr->tbr_depth = TBR_SCALE(profile->depth); if (tbr->tbr_rate > 0) tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; else tbr->tbr_filluptime = LLONG_MAX; /* * The longest time between tbr_dequeue() calls will be about 1 * system tick, as the callout that drives it is scheduled once per * tick. The refill-time detection logic in tbr_dequeue() can only * properly detect the passage of up to LLONG_MAX machclk ticks. * Therefore, in order for this logic to function properly in the * extreme case, the maximum value of tbr_filluptime should be * LLONG_MAX less one system tick's worth of machclk ticks less * some additional slop factor (here one more system tick's worth * of machclk ticks). */ if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick)) tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick; tbr->tbr_token = tbr->tbr_depth; tbr->tbr_last = read_machclk(); tbr->tbr_lastop = ALTDQ_REMOVE; otbr = ifq->altq_tbr; ifq->altq_tbr = tbr; /* set the new tbr */ if (otbr != NULL) free(otbr, M_DEVBUF); else { if (tbr_timer == 0) { CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); tbr_timer = 1; } } IFQ_UNLOCK(ifq); return (0); } /* * tbr_timeout goes through the interface list, and kicks the drivers * if necessary. 
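A quick sanity check of the tbr_set() fixed-point arithmetic above, written as a standalone userland sketch. The 100 Mbit/s rate, 12500-byte depth and 1 GHz machclk are made-up example values, not taken from this change:

    #include <stdint.h>
    #include <stdio.h>

    #define TBR_SHIFT	29
    #define TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)

    int
    main(void)
    {
        /* hypothetical profile: 100 Mbit/s, 12500-byte bucket, 1 GHz machclk */
        int64_t rate  = TBR_SCALE(100000000 / 8) / 1000000000;  /* ~6710886 scaled bytes/tick */
        int64_t depth = TBR_SCALE(12500);
        int64_t filluptime = depth / rate;

        /* ~1000000 machclk ticks, i.e. 1 ms: 12500 bytes at 12.5 Mbyte/s */
        printf("rate %jd depth %jd filluptime %jd\n",
            (intmax_t)rate, (intmax_t)depth, (intmax_t)filluptime);
        return (0);
    }

The same scaled quantities drive tbr_dequeue() above: the token is topped up by interval * tbr_rate, capped at tbr_depth, and dequeueing is refused while it is non-positive.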
* * MPSAFE */ static void tbr_timeout(arg) void *arg; { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; struct epoch_tracker et; int active; active = 0; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { /* read from if_snd unlocked */ if (!TBR_IS_ENABLED(&ifp->if_snd)) continue; active++; if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) (*ifp->if_start)(ifp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); if (active > 0) CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); else tbr_timer = 0; /* don't need tbr_timer anymore */ } /* * attach a discipline to the interface. if one already exists, it is * overridden. * Locking is done in the discipline specific attach functions. Basically * they call back to altq_attach which takes care of the attach and locking. */ int altq_pfattach(struct pf_altq *a) { int error = 0; switch (a->scheduler) { case ALTQT_NONE: break; #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_pfattach(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_pfattach(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_pfattach(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_pfattach(a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_pfattach(a); break; #endif default: error = ENXIO; } return (error); } /* * detach a discipline from the interface. * it is possible that the discipline was already overridden by another * discipline. */ int altq_pfdetach(struct pf_altq *a) { struct ifnet *ifp; int s, error = 0; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); /* if this discipline is no longer referenced, just return */ /* read unlocked from if_snd */ if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) return (0); s = splnet(); /* read unlocked from if_snd, _disable and _detach take care */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) error = altq_disable(&ifp->if_snd); if (error == 0) error = altq_detach(&ifp->if_snd); splx(s); return (error); } /* * add a discipline or a queue * Locking is done in the discipline specific functions with regards to * malloc with WAITOK, also it is not yet clear which lock to use. 
*/ int altq_add(struct ifnet *ifp, struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_add_queue(a)); if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) panic("altq_add: no cpu clock"); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_altq(ifp, a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_altq(ifp, a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_altq(ifp, a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_add_altq(ifp, a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_add_altq(ifp, a); break; #endif default: error = ENXIO; } return (error); } /* * remove a discipline or a queue * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove(struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_remove_queue(a)); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_altq(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_altq(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_altq(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_remove_altq(a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_remove_altq(a); break; #endif default: error = ENXIO; } return (error); } /* * add a queue to the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_add_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_queue(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_add_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * remove a queue from the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_queue(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_remove_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * get queue statistics * Locking is done in the discipline specific functions with regards to * copyout operations, also it is not yet clear which lock to use. 
*/ int altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_getqstats(a, ubuf, nbytes, version); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_getqstats(a, ubuf, nbytes, version); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_getqstats(a, ubuf, nbytes, version); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_getqstats(a, ubuf, nbytes, version); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_getqstats(a, ubuf, nbytes, version); break; #endif default: error = ENXIO; } return (error); } /* * read and write diffserv field in IPv4 or IPv6 header */ u_int8_t read_dsfield(m, pktattr) struct mbuf *m; struct altq_pktattr *pktattr; { struct mbuf *m0; u_int8_t ds_field = 0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return ((u_int8_t)0); /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("read_dsfield: can't locate header!\n"); #endif return ((u_int8_t)0); } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; if (ip->ip_v != 4) return ((u_int8_t)0); /* version mismatch! */ ds_field = ip->ip_tos; } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return ((u_int8_t)0); /* version mismatch! */ ds_field = (flowlabel >> 20) & 0xff; } #endif return (ds_field); } void write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield) { struct mbuf *m0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return; /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("write_dsfield: can't locate header!\n"); #endif return; } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; u_int8_t old; int32_t sum; if (ip->ip_v != 4) return; /* version mismatch! */ old = ip->ip_tos; dsfield |= old & 3; /* leave CU bits */ if (old == dsfield) return; ip->ip_tos = dsfield; /* * update checksum (from RFC1624) * HC' = ~(~HC + ~m + m') */ sum = ~ntohs(ip->ip_sum) & 0xffff; sum += 0xff00 + (~old & 0xff) + dsfield; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); /* add carry */ ip->ip_sum = htons(~sum & 0xffff); } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return; /* version mismatch! */ flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); ip6->ip6_flow = htonl(flowlabel); } #endif return; } /* * high resolution clock support taking advantage of a machine dependent * high resolution time counter (e.g., timestamp counter of intel pentium). 
* we assume * - 64-bit-long monotonically-increasing counter * - frequency range is 100M-4GHz (CPU speed) */ /* if pcc is not available or disabled, emulate 256MHz using microtime() */ #define MACHCLK_SHIFT 8 int machclk_usepcc; u_int32_t machclk_freq; u_int32_t machclk_per_tick; #if defined(__i386__) && defined(__NetBSD__) extern u_int64_t cpu_tsc_freq; #endif #if (__FreeBSD_version >= 700035) /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { /* If there was an error during the transition, don't do anything. */ if (status != 0) return; #if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__)) /* If TSC is P-state invariant, don't do anything. */ if (tsc_is_invariant) return; #endif /* Total setting for this level gives the new frequency in MHz. */ init_machclk(); } EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_LAST); #endif /* __FreeBSD_version >= 700035 */ static void init_machclk_setup(void) { #if (__FreeBSD_version >= 600000) callout_init(&tbr_callout, 0); #endif machclk_usepcc = 1; #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC) machclk_usepcc = 0; #endif #if defined(__FreeBSD__) && defined(SMP) machclk_usepcc = 0; #endif #if defined(__NetBSD__) && defined(MULTIPROCESSOR) machclk_usepcc = 0; #endif #if defined(__amd64__) || defined(__i386__) /* check if TSC is available */ if ((cpu_feature & CPUID_TSC) == 0 || atomic_load_acq_64(&tsc_freq) == 0) machclk_usepcc = 0; #endif } void init_machclk(void) { static int called; /* Call one-time initialization function. */ if (!called) { init_machclk_setup(); called = 1; } if (machclk_usepcc == 0) { /* emulate 256MHz using microtime() */ machclk_freq = 1000000 << MACHCLK_SHIFT; machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: emulate %uHz cpu clock\n", machclk_freq); #endif return; } /* * if the clock frequency (of Pentium TSC or Alpha PCC) is * accessible, just use it. */ #if defined(__amd64__) || defined(__i386__) machclk_freq = atomic_load_acq_64(&tsc_freq); #endif /* * if we don't know the clock frequency, measure it. 
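Two details of init_machclk() worth spelling out: the emulated-clock path is literally 1000000 << MACHCLK_SHIFT = 256,000,000 Hz (hence the "256MHz" wording), and the measured path below simply scales the machclk delta by the elapsed microseconds. A tiny sketch of that arithmetic, with a hypothetical sample rather than anything read from real hardware:

    #include <stdint.h>

    /* freq = ticks_elapsed * 1e6 / usecs_elapsed, as in the code below */
    static uint32_t
    estimate_machclk_freq(uint64_t ticks, uint64_t usecs)
    {
        return ((uint32_t)(ticks * 1000000 / usecs));
    }
    /* estimate_machclk_freq(2400000000ULL, 1000000ULL) == 2400000000, a 2.4 GHz TSC */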
*/ if (machclk_freq == 0) { static int wait; struct timeval tv_start, tv_end; u_int64_t start, end, diff; int timo; microtime(&tv_start); start = read_machclk(); timo = hz; /* 1 sec */ (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); microtime(&tv_end); end = read_machclk(); diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + tv_end.tv_usec - tv_start.tv_usec; if (diff != 0) machclk_freq = (u_int)((end - start) * 1000000 / diff); } machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: CPU clock: %uHz\n", machclk_freq); #endif } #if defined(__OpenBSD__) && defined(__i386__) static __inline u_int64_t rdtsc(void) { u_int64_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } #endif /* __OpenBSD__ && __i386__ */ u_int64_t read_machclk(void) { u_int64_t val; if (machclk_usepcc) { #if defined(__amd64__) || defined(__i386__) val = rdtsc(); #else panic("read_machclk"); #endif } else { struct timeval tv, boottime; microtime(&tv); getboottime(&boottime); val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + tv.tv_usec) << MACHCLK_SHIFT); } return (val); } #ifdef ALTQ3_CLFIER_COMPAT #ifndef IPPROTO_ESP #define IPPROTO_ESP 50 /* encapsulating security payload */ #endif #ifndef IPPROTO_AH #define IPPROTO_AH 51 /* authentication header */ #endif /* * extract flow information from a given packet. * filt_mask shows flowinfo fields required. * we assume the ip header is in one mbuf, and addresses and ports are * in network byte order. */ int altq_extractflow(m, af, flow, filt_bmask) struct mbuf *m; int af; struct flowinfo *flow; u_int32_t filt_bmask; { switch (af) { case PF_INET: { struct flowinfo_in *fin; struct ip *ip; ip = mtod(m, struct ip *); if (ip->ip_v != 4) break; fin = (struct flowinfo_in *)flow; fin->fi_len = sizeof(struct flowinfo_in); fin->fi_family = AF_INET; fin->fi_proto = ip->ip_p; fin->fi_tos = ip->ip_tos; fin->fi_src.s_addr = ip->ip_src.s_addr; fin->fi_dst.s_addr = ip->ip_dst.s_addr; if (filt_bmask & FIMB4_PORTS) /* if port info is required, extract port numbers */ extract_ports4(m, ip, fin); else { fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; } return (1); } #ifdef INET6 case PF_INET6: { struct flowinfo_in6 *fin6; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); /* should we check the ip version? */ fin6 = (struct flowinfo_in6 *)flow; fin6->fi6_len = sizeof(struct flowinfo_in6); fin6->fi6_family = AF_INET6; fin6->fi6_proto = ip6->ip6_nxt; - fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + fin6->fi6_tclass = IPV6_TRAFFIC_CLASS(ip6); fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); fin6->fi6_src = ip6->ip6_src; fin6->fi6_dst = ip6->ip6_dst; if ((filt_bmask & FIMB6_PORTS) || ((filt_bmask & FIMB6_PROTO) && ip6->ip6_nxt > IPPROTO_IPV6)) /* * if port info is required, or proto is required * but there are option headers, extract port * and protocol numbers. */ extract_ports6(m, ip6, fin6); else { fin6->fi6_sport = 0; fin6->fi6_dport = 0; fin6->fi6_gpi = 0; } return (1); } #endif /* INET6 */ default: break; } /* failed */ flow->fi_len = sizeof(struct flowinfo); flow->fi_family = AF_UNSPEC; return (0); } /* * helper routine to extract port numbers */ /* structure for ipsec and ipv6 option header template */ struct _opt6 { u_int8_t opt6_nxt; /* next header */ u_int8_t opt6_hlen; /* header extension length */ u_int16_t _pad; u_int32_t ah_spi; /* security parameter index for authentication header */ }; /* * extract port numbers from a ipv4 packet. 
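One note on the extension-header walk in extract_ports4()/extract_ports6() below: the two length formulas differ because AH measures its payload length in 32-bit words (excluding the first two), while hop-by-hop, routing and destination options headers measure theirs in 8-byte units (excluding the first unit). Worked out:

    /*
     *  AH:                 off += 8 + opt6_hlen * 4
     *                      e.g. opt6_hlen = 4  ->  24-byte header
     *  HBH/RTHDR/DSTOPTS:  off += (opt6_hlen + 1) * 8
     *                      e.g. opt6_hlen = 0  ->   8-byte header
     */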
*/ static int extract_ports4(m, ip, fin) struct mbuf *m; struct ip *ip; struct flowinfo_in *fin; { struct mbuf *m0; u_short ip_off; u_int8_t proto; int off; fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; ip_off = ntohs(ip->ip_off); /* if it is a fragment, try cached fragment info */ if (ip_off & IP_OFFMASK) { ip4f_lookup(ip, fin); return (1); } /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip >= m0->m_data) && ((caddr_t)ip < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports4: can't locate header! ip=%p\n", ip); #endif return (0); } off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); proto = ip->ip_p; #ifdef ALTQ_IPSEC again: #endif while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); /* bogus ip_hl! */ } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin->fi_sport = udp->uh_sport; fin->fi_dport = udp->uh_dport; fin->fi_proto = proto; } break; #ifdef ALTQ_IPSEC case IPPROTO_ESP: if (fin->fi_gpi == 0){ u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin->fi_gpi = *gpi; } fin->fi_proto = proto; break; case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); if (fin->fi_gpi == 0 && m0->m_len >= off + 8) fin->fi_gpi = opt6->ah_spi; } /* goto the next header */ goto again; #endif /* ALTQ_IPSEC */ default: fin->fi_proto = proto; return (0); } /* if this is a first fragment, cache it. */ if (ip_off & IP_MF) ip4f_cache(ip, fin); return (1); } #ifdef INET6 static int extract_ports6(m, ip6, fin6) struct mbuf *m; struct ip6_hdr *ip6; struct flowinfo_in6 *fin6; { struct mbuf *m0; int off; u_int8_t proto; fin6->fi6_gpi = 0; fin6->fi6_sport = 0; fin6->fi6_dport = 0; /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip6 >= m0->m_data) && ((caddr_t)ip6 < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports6: can't locate header! 
ip6=%p\n", ip6); #endif return (0); } off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; do { while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin6->fi6_sport = udp->uh_sport; fin6->fi6_dport = udp->uh_dport; fin6->fi6_proto = proto; } return (1); case IPPROTO_ESP: if (fin6->fi6_gpi == 0) { u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin6->fi6_gpi = *gpi; } fin6->fi6_proto = proto; return (1); case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) fin6->fi6_gpi = opt6->ah_spi; proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); /* goto the next header */ break; } case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += (opt6->opt6_hlen + 1) * 8; /* goto the next header */ break; } case IPPROTO_FRAGMENT: /* ipv6 fragmentations are not supported yet */ default: fin6->fi6_proto = proto; return (0); } } while (1); /*NOTREACHED*/ } #endif /* INET6 */ /* * altq common classifier */ int acc_add_filter(classifier, filter, class, phandle) struct acc_classifier *classifier; struct flow_filter *filter; void *class; u_long *phandle; { struct acc_filter *afp, *prev, *tmp; int i, s; #ifdef INET6 if (filter->ff_flow.fi_family != AF_INET && filter->ff_flow.fi_family != AF_INET6) return (EINVAL); #else if (filter->ff_flow.fi_family != AF_INET) return (EINVAL); #endif afp = malloc(sizeof(struct acc_filter), M_DEVBUF, M_WAITOK); if (afp == NULL) return (ENOMEM); bzero(afp, sizeof(struct acc_filter)); afp->f_filter = *filter; afp->f_class = class; i = ACC_WILDCARD_INDEX; if (filter->ff_flow.fi_family == AF_INET) { struct flow_filter *filter4 = &afp->f_filter; /* * if address is 0, it's a wildcard. if address mask * isn't set, use full mask. */ if (filter4->ff_flow.fi_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0; else if (filter4->ff_mask.mask_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0xffffffff; if (filter4->ff_flow.fi_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0; else if (filter4->ff_mask.mask_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0xffffffff; /* clear extra bits in addresses */ filter4->ff_flow.fi_dst.s_addr &= filter4->ff_mask.mask_dst.s_addr; filter4->ff_flow.fi_src.s_addr &= filter4->ff_mask.mask_src.s_addr; /* * if dst address is a wildcard, use hash-entry * ACC_WILDCARD_INDEX. 
*/ if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); } #ifdef INET6 else if (filter->ff_flow.fi_family == AF_INET6) { struct flow_filter6 *filter6 = (struct flow_filter6 *)&afp->f_filter; #ifndef IN6MASK0 /* taken from kame ipv6 */ #define IN6MASK0 {{{ 0, 0, 0, 0 }}} #define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask128 = IN6MASK128; #endif if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) filter6->ff_mask6.mask6_dst = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) filter6->ff_mask6.mask6_dst = in6mask128; if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) filter6->ff_mask6.mask6_src = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) filter6->ff_mask6.mask6_src = in6mask128; /* clear extra bits in addresses */ for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_dst.s6_addr[i] &= filter6->ff_mask6.mask6_dst.s6_addr[i]; for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_src.s6_addr[i] &= filter6->ff_mask6.mask6_src.s6_addr[i]; if (filter6->ff_flow6.fi6_flowlabel == 0) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); } #endif /* INET6 */ afp->f_handle = get_filt_handle(classifier, i); /* update filter bitmask */ afp->f_fbmask = filt2fibmask(filter); classifier->acc_fbmask |= afp->f_fbmask; /* * add this filter to the filter list. * filters are ordered from the highest rule number. */ s = splnet(); prev = NULL; LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) prev = tmp; else break; } if (prev == NULL) LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); else LIST_INSERT_AFTER(prev, afp, f_chain); splx(s); *phandle = afp->f_handle; return (0); } int acc_delete_filter(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int s; if ((afp = filth_to_filtp(classifier, handle)) == NULL) return (EINVAL); s = splnet(); LIST_REMOVE(afp, f_chain); splx(s); free(afp, M_DEVBUF); /* todo: update filt_bmask */ return (0); } /* * delete filters referencing to the specified class. * if the all flag is not 0, delete all the filters. 
*/ int acc_discard_filters(classifier, class, all) struct acc_classifier *classifier; void *class; int all; { struct acc_filter *afp; int i, s; s = splnet(); for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (all || afp->f_class == class) { LIST_REMOVE(afp, f_chain); free(afp, M_DEVBUF); /* start again from the head */ break; } } while (afp != NULL); } splx(s); if (all) classifier->acc_fbmask = 0; return (0); } void * acc_classify(clfier, m, af) void *clfier; struct mbuf *m; int af; { struct acc_classifier *classifier; struct flowinfo flow; struct acc_filter *afp; int i; classifier = (struct acc_classifier *)clfier; altq_extractflow(m, af, &flow, classifier->acc_fbmask); if (flow.fi_family == AF_INET) { struct flowinfo_in *fp = (struct flowinfo_in *)&flow; if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { /* only tos is used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_tosfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else if ((classifier->acc_fbmask & (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) == 0) { /* only proto and ports are used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_ppfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else { /* get the filter hash entry from its dest address */ i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); do { /* * go through this loop twice. first for dst * hash, second for wildcards. */ LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); /* * check again for filters with a dst addr * wildcard. * (daddr == 0 || dmask != 0xffffffff). */ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } } #ifdef INET6 else if (flow.fi_family == AF_INET6) { struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; /* get the filter hash entry from its flow ID */ if (fp6->fi6_flowlabel != 0) i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); else /* flowlable can be zero */ i = ACC_WILDCARD_INDEX; /* go through this loop twice. first for flow hash, second for wildcards. */ do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter6(afp->f_fbmask, (struct flow_filter6 *)&afp->f_filter, fp6)) /* filter matched */ return (afp->f_class); /* * check again for filters with a wildcard. 
*/ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } #endif /* INET6 */ /* no filter matched */ return (NULL); } static int apply_filter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_DADDR) && filt->ff_flow.fi_dst.s_addr != (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) return (0); if ((fbmask & FIMB4_SADDR) && filt->ff_flow.fi_src.s_addr != (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) return (0); /* match */ return (1); } /* * filter matching function optimized for a common case that checks * only protocol and port numbers */ static int apply_ppfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); /* match */ return (1); } /* * filter matching function only for tos field. */ static int apply_tosfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); /* match */ return (1); } #ifdef INET6 static int apply_filter6(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter6 *filt; struct flowinfo_in6 *pkt; { int i; if (filt->ff_flow6.fi6_family != AF_INET6) return (0); if ((fbmask & FIMB6_FLABEL) && filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) return (0); if ((fbmask & FIMB6_PROTO) && filt->ff_flow6.fi6_proto != pkt->fi6_proto) return (0); if ((fbmask & FIMB6_SPORT) && filt->ff_flow6.fi6_sport != pkt->fi6_sport) return (0); if ((fbmask & FIMB6_DPORT) && filt->ff_flow6.fi6_dport != pkt->fi6_dport) return (0); if (fbmask & FIMB6_SADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_src.s6_addr32[i] != (pkt->fi6_src.s6_addr32[i] & filt->ff_mask6.mask6_src.s6_addr32[i])) return (0); } if (fbmask & FIMB6_DADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_dst.s6_addr32[i] != (pkt->fi6_dst.s6_addr32[i] & filt->ff_mask6.mask6_dst.s6_addr32[i])) return (0); } if ((fbmask & FIMB6_TCLASS) && filt->ff_flow6.fi6_tclass != (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) return (0); if ((fbmask & FIMB6_GPI) && filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) return (0); /* match */ return (1); } #endif /* INET6 */ /* * filter handle: * bit 20-28: index to the filter hash table * bit 0-19: unique id in the hash bucket. 
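That is, a handle is the hash-bucket index shifted above a 20-bit per-bucket id, which is what get_filt_handle() constructs and filth_to_filtp() unpacks below. A minimal illustration with made-up helper names (only the bit layout comes from the comment above):

    /* pack: bucket index above bit 20, per-bucket id in the low 20 bits */
    static unsigned long
    example_pack_handle(int bucket, unsigned long id)
    {
        return (((unsigned long)bucket << 20) | (id & 0x000fffff));
    }

    /* unpack the bucket index again, as ACC_GET_HINDEX() is expected to do */
    static int
    example_handle_bucket(unsigned long handle)
    {
        return ((int)(handle >> 20));
    }
    /* example_pack_handle(5, 42) == 0x50002a; example_handle_bucket() returns 5 */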
*/ static u_long get_filt_handle(classifier, i) struct acc_classifier *classifier; int i; { static u_long handle_number = 1; u_long handle; struct acc_filter *afp; while (1) { handle = handle_number++ & 0x000fffff; if (LIST_EMPTY(&classifier->acc_filters[i])) break; LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if ((afp->f_handle & 0x000fffff) == handle) break; if (afp == NULL) break; /* this handle is already used, try again */ } return ((i << 20) | handle); } /* convert filter handle to filter pointer */ static struct acc_filter * filth_to_filtp(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int i; i = ACC_GET_HINDEX(handle); LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (afp->f_handle == handle) return (afp); return (NULL); } /* create flowinfo bitmask */ static u_int32_t filt2fibmask(filt) struct flow_filter *filt; { u_int32_t mask = 0; #ifdef INET6 struct flow_filter6 *filt6; #endif switch (filt->ff_flow.fi_family) { case AF_INET: if (filt->ff_flow.fi_proto != 0) mask |= FIMB4_PROTO; if (filt->ff_flow.fi_tos != 0) mask |= FIMB4_TOS; if (filt->ff_flow.fi_dst.s_addr != 0) mask |= FIMB4_DADDR; if (filt->ff_flow.fi_src.s_addr != 0) mask |= FIMB4_SADDR; if (filt->ff_flow.fi_sport != 0) mask |= FIMB4_SPORT; if (filt->ff_flow.fi_dport != 0) mask |= FIMB4_DPORT; if (filt->ff_flow.fi_gpi != 0) mask |= FIMB4_GPI; break; #ifdef INET6 case AF_INET6: filt6 = (struct flow_filter6 *)filt; if (filt6->ff_flow6.fi6_proto != 0) mask |= FIMB6_PROTO; if (filt6->ff_flow6.fi6_tclass != 0) mask |= FIMB6_TCLASS; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) mask |= FIMB6_DADDR; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) mask |= FIMB6_SADDR; if (filt6->ff_flow6.fi6_sport != 0) mask |= FIMB6_SPORT; if (filt6->ff_flow6.fi6_dport != 0) mask |= FIMB6_DPORT; if (filt6->ff_flow6.fi6_gpi != 0) mask |= FIMB6_GPI; if (filt6->ff_flow6.fi6_flowlabel != 0) mask |= FIMB6_FLABEL; break; #endif /* INET6 */ } return (mask); } /* * helper functions to handle IPv4 fragments. * currently only in-sequence fragments are handled. * - fragment info is cached in a LRU list. * - when a first fragment is found, cache its flow info. * - when a non-first fragment is found, lookup the cache. */ struct ip4_frag { TAILQ_ENTRY(ip4_frag) ip4f_chain; char ip4f_valid; u_short ip4f_id; struct flowinfo_in ip4f_info; }; static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ #define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ static void ip4f_cache(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; if (TAILQ_EMPTY(&ip4f_list)) { /* first time call, allocate fragment cache entries. */ if (ip4f_init() < 0) /* allocation failed! 
*/ return; } fp = ip4f_alloc(); fp->ip4f_id = ip->ip_id; fp->ip4f_info.fi_proto = ip->ip_p; fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr; fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr; /* save port numbers */ fp->ip4f_info.fi_sport = fin->fi_sport; fp->ip4f_info.fi_dport = fin->fi_dport; fp->ip4f_info.fi_gpi = fin->fi_gpi; } static int ip4f_lookup(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid; fp = TAILQ_NEXT(fp, ip4f_chain)) if (ip->ip_id == fp->ip4f_id && ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr && ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr && ip->ip_p == fp->ip4f_info.fi_proto) { /* found the matching entry */ fin->fi_sport = fp->ip4f_info.fi_sport; fin->fi_dport = fp->ip4f_info.fi_dport; fin->fi_gpi = fp->ip4f_info.fi_gpi; if ((ntohs(ip->ip_off) & IP_MF) == 0) /* this is the last fragment, release the entry. */ ip4f_free(fp); return (1); } /* no matching entry found */ return (0); } static int ip4f_init(void) { struct ip4_frag *fp; int i; TAILQ_INIT(&ip4f_list); for (i=0; iip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } return (0); } static struct ip4_frag * ip4f_alloc(void) { struct ip4_frag *fp; /* reclaim an entry at the tail, put it at the head */ fp = TAILQ_LAST(&ip4f_list, ip4f_list); TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 1; TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain); return (fp); } static void ip4f_free(fp) struct ip4_frag *fp; { TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } #endif /* ALTQ3_CLFIER_COMPAT */ diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index c3f26db3f6e6..40f8a6f3a30a 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -1,762 +1,762 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. 
* * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. 
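Concretely, a 6to4 address keeps the embedded IPv4 address in bytes 2 through 5, immediately after the 2002: prefix, which is why GET_V4() below points at s6_addr16[1] and callers copy it out with bcopy(). A small userland sketch of the same extraction (function name is illustrative, not part of this diff):

    #include <netinet/in.h>
    #include <string.h>

    /* Copy out the IPv4 address embedded in a 2002::/16 (6to4) IPv6 address. */
    static struct in_addr
    example_6to4_embedded_v4(const struct in6_addr *a6)
    {
        struct in_addr a4;

        memcpy(&a4, &a6->s6_addr[2], sizeof(a4));   /* bytes 2..5 */
        return (a4);
    }
    /* 2002:c058:6301:: yields 192.88.99.1 (0xc0 0x58 0x63 0x01). */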
*/ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; static int in_stf_input(struct mbuf *, int, int, void *); static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_IPV6, .min_length = sizeof(struct ip), .exact_match = (sizeof(in_addr_t) << 3) + 8, .check = stf_encapcheck, .input = in_stf_input }; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. 
*/ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err __unused; err = ip_encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. 
* success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct rm_priotracker in_ifa_tracker; struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; return (0); } return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); - tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + tos = IPV6_TRAFFIC_CLASS(ip6); /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
*/ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with broadcast */ IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { struct nhop_object *nh; NET_EPOCH_ASSERT(); nh = fib4_lookup(sc->sc_fibnum, *in, 0, 0, 0); if (nh == NULL) return (-1); if (nh->nh_ifp != inifp) return (-1); } return 0; } static int stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; bcopy(GET_V4(in6), &in4, sizeof(in4)); return stf_checkaddr4(sc, &in4, inifp); } /* * reject anything that look suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } static int in_stf_input(struct mbuf *m, int off, int proto, void *arg) { struct stf_softc *sc = arg; struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; struct ifnet *ifp; NET_EPOCH_ASSERT(); if (proto != IPPROTO_IPV6) { m_freem(m); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. 
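The sanity checks in in_stf_input() below go through stf_checkaddr4() and isrfc1918addr() above; the RFC 1918 test reads densely because it compares shifted prefixes as integers (172 * 256 + 16 == 0xac10, the top 12 bits of 172.16.0.0/12). A standalone check of one address, using an illustrative literal:

    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct in_addr in;

        inet_pton(AF_INET, "172.31.5.9", &in);      /* hypothetical sample */
        /* (0xac1f0509 & 0xfff00000) >> 16 == 0xac10 == 172 * 256 + 16 */
        printf("%d\n", (ntohl(in.s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16);
        return (0);     /* prints 1: the address falls in RFC 1918 space */
    }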
*/ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } - itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + itos = IPV6_TRAFFIC_CLASS(ip6); if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h index 44c46fd3b71d..1bc79a98e689 100644 --- a/sys/netinet/ip6.h +++ b/sys/netinet/ip6.h @@ -1,260 +1,264 @@ /* $FreeBSD$ */ /* $KAME: ip6.h,v 1.18 2001/03/29 05:34:30 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET_IP6_H_ #define _NETINET_IP6_H_ /* * Definition for internet protocol version 6. 
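 * The first 32 bits (ip6_flow) pack the version (4 bits), the traffic
 * class (8 bits) and the flow label (20 bits).  After ntohl() the traffic
 * class occupies bits 20..27, which is what the IPV6_TRAFFIC_CLASS(),
 * IPV6_DSCP() and IPV6_ECN() macros below extract; an illustrative
 * sketch:
 *
 *	tc   = (ntohl(ip6->ip6_flow) >> 20) & 0xff;	whole traffic class
 *	dscp = tc & 0xfc;				upper six (DSCP) bits
 *	ecn  = tc & 0x03;				two ECN bits
 *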
* RFC 2460 */ struct ip6_hdr { union { struct ip6_hdrctl { u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ u_int16_t ip6_un1_plen; /* payload length */ u_int8_t ip6_un1_nxt; /* next header */ u_int8_t ip6_un1_hlim; /* hop limit */ } ip6_un1; u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ } ip6_ctlun; struct in6_addr ip6_src; /* source address */ struct in6_addr ip6_dst; /* destination address */ } __packed; #define ip6_vfc ip6_ctlun.ip6_un2_vfc #define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow #define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen #define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt #define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim #define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim #define IPV6_VERSION 0x60 #define IPV6_VERSION_MASK 0xf0 #if BYTE_ORDER == BIG_ENDIAN #define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */ #define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */ #else #if BYTE_ORDER == LITTLE_ENDIAN #define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ #define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ #endif /* LITTLE_ENDIAN */ #endif #define IPV6_FLOWLABEL_LEN 20 +#define IPV6_TRAFFIC_CLASS(ip6) ((ntohl((ip6)->ip6_flow) >> 20) & 0xff) +#define IPV6_DSCP(ip6) ((ntohl((ip6)->ip6_flow) >> 20) & 0xfc) +#define IPV6_ECN(ip6) ((ntohl((ip6)->ip6_flow) >> 20) & 0x03) + /* * Extension Headers */ struct ip6_ext { u_int8_t ip6e_nxt; u_int8_t ip6e_len; } __packed; /* Hop-by-Hop options header */ /* XXX should we pad it to force alignment on an 8-byte boundary? */ struct ip6_hbh { u_int8_t ip6h_nxt; /* next header */ u_int8_t ip6h_len; /* length in units of 8 octets */ /* followed by options */ } __packed; /* Destination options header */ /* XXX should we pad it to force alignment on an 8-byte boundary? 
*/ struct ip6_dest { u_int8_t ip6d_nxt; /* next header */ u_int8_t ip6d_len; /* length in units of 8 octets */ /* followed by options */ } __packed; /* Option types and related macros */ #define IP6OPT_PAD1 0x00 /* 00 0 00000 */ #define IP6OPT_PADN 0x01 /* 00 0 00001 */ #define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */ #define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */ #define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */ #ifndef _KERNEL #define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */ #endif #define IP6OPT_ROUTER_ALERT 0x05 /* 00 0 00101 (RFC3542, recommended) */ #define IP6OPT_RTALERT_LEN 4 #define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */ #define IP6OPT_RTALERT_RSVP 1 /* Datagram contains an RSVP message */ #define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */ #define IP6OPT_MINLEN 2 #define IP6OPT_EID 0x8a /* 10 0 01010 */ #define IP6OPT_TYPE(o) ((o) & 0xC0) #define IP6OPT_TYPE_SKIP 0x00 #define IP6OPT_TYPE_DISCARD 0x40 #define IP6OPT_TYPE_FORCEICMP 0x80 #define IP6OPT_TYPE_ICMP 0xC0 #define IP6OPT_MUTABLE 0x20 /* IPv6 options: common part */ struct ip6_opt { u_int8_t ip6o_type; u_int8_t ip6o_len; } __packed; /* Jumbo Payload Option */ struct ip6_opt_jumbo { u_int8_t ip6oj_type; u_int8_t ip6oj_len; u_int8_t ip6oj_jumbo_len[4]; } __packed; #define IP6OPT_JUMBO_LEN 6 /* NSAP Address Option */ struct ip6_opt_nsap { u_int8_t ip6on_type; u_int8_t ip6on_len; u_int8_t ip6on_src_nsap_len; u_int8_t ip6on_dst_nsap_len; /* followed by source NSAP */ /* followed by destination NSAP */ } __packed; /* Tunnel Limit Option */ struct ip6_opt_tunnel { u_int8_t ip6ot_type; u_int8_t ip6ot_len; u_int8_t ip6ot_encap_limit; } __packed; /* Router Alert Option */ struct ip6_opt_router { u_int8_t ip6or_type; u_int8_t ip6or_len; u_int8_t ip6or_value[2]; } __packed; /* Router alert values (in network byte order) */ #if BYTE_ORDER == BIG_ENDIAN #define IP6_ALERT_MLD 0x0000 #define IP6_ALERT_RSVP 0x0001 #define IP6_ALERT_AN 0x0002 #else #if BYTE_ORDER == LITTLE_ENDIAN #define IP6_ALERT_MLD 0x0000 #define IP6_ALERT_RSVP 0x0100 #define IP6_ALERT_AN 0x0200 #endif /* LITTLE_ENDIAN */ #endif /* Routing header */ struct ip6_rthdr { u_int8_t ip6r_nxt; /* next header */ u_int8_t ip6r_len; /* length in units of 8 octets */ u_int8_t ip6r_type; /* routing type */ u_int8_t ip6r_segleft; /* segments left */ /* followed by routing type specific data */ } __packed; /* Type 0 Routing header, deprecated by RFC 5095. */ struct ip6_rthdr0 { u_int8_t ip6r0_nxt; /* next header */ u_int8_t ip6r0_len; /* length in units of 8 octets */ u_int8_t ip6r0_type; /* always zero */ u_int8_t ip6r0_segleft; /* segments left */ u_int32_t ip6r0_reserved; /* reserved field */ /* followed by up to 127 struct in6_addr */ } __packed; /* Fragment header */ struct ip6_frag { u_int8_t ip6f_nxt; /* next header */ u_int8_t ip6f_reserved; /* reserved field */ u_int16_t ip6f_offlg; /* offset, reserved, and flag */ u_int32_t ip6f_ident; /* identification */ } __packed; #if BYTE_ORDER == BIG_ENDIAN #define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ #define IP6F_RESERVED_MASK 0x0006 /* reserved bits in ip6f_offlg */ #define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ #else /* BYTE_ORDER == LITTLE_ENDIAN */ #define IP6F_OFF_MASK 0xf8ff /* mask out offset from _offlg */ #define IP6F_RESERVED_MASK 0x0600 /* reserved bits in ip6f_offlg */ #define IP6F_MORE_FRAG 0x0100 /* more-fragments flag */ #endif /* BYTE_ORDER == LITTLE_ENDIAN */ /* * Internet implementation parameters. 
*/ #define IPV6_MAXHLIM 255 /* maximum hoplimit */ #define IPV6_DEFHLIM 64 /* default hlim */ #define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */ #define IPV6_HLIMDEC 1 /* subtracted when forwarding */ #define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */ #define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ #define IPV6_MAXOPTHDR 2048 /* max option header size, 256 64-bit words */ #endif /* not _NETINET_IP6_H_ */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index eda41d36ab88..89e70df48774 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4074 +1,4074 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #include #include #include const int tcprexmtthresh = 3; VNET_DEFINE(int, tcp_log_in_vain) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_prr_conservative) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr_conservative), 0, "Do conservative Proportional Rate Reduction"); VNET_DEFINE(int, tcp_do_prr) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, "Enable Proportional Rate Reduction per RFC 6937"); VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, "Enable New Congestion Window Validation per RFC7661"); VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc6675_pipe), 0, "Use calculated pipe/in-flight bytes per RFC 6675"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd 
increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 2; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); static void tcp_vnet_init(const void *unused) { COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_init, NULL); #ifdef VIMAGE static void tcp_vnet_uninit(const void *unused) { COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); } VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, tcp_vnet_uninit, NULL); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The first argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_add(int statnum, int val) { counter_u64_add(VNET(tcpstat)[statnum], val); } #ifdef TCP_HHOOK /* * Wrapper for the TCP established input helper hook. 
*/ void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } #endif /* * CC wrapper hook functions */ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { #ifdef STATS int32_t gput; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2)))) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)tp->snd_cwnd) - tp->snd_wnd); if (!IN_RECOVERY(tp->t_flags)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs)); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Compute goodput in bits per millisecond. */ gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an API * to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; } #endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += tp->ccv->bytes_this_ack; if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif } void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. 
*/ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else tp->snd_cwnd = tcp_compute_initwnd(maxseg); if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); #endif switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags) || /* * Allow ECN reaction on ACK to CWR, if * that data segment was also CE marked. */ SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max + 1; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? */ tp->t_bytes_acked = 0; } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. 
*/ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->ecnpkt_handler != NULL) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->ccv->flags |= CCF_IPHDR_CE; break; case IPTOS_ECN_ECT0: /* FALLTHROUGH */ case IPTOS_ECN_ECT1: /* FALLTHROUGH */ case IPTOS_ECN_NOTECT: tp->ccv->flags &= ~CCF_IPHDR_CE; break; } if (th->th_flags & TH_CWR) tp->ccv->flags |= CCF_TCPHDR_CWR; else tp->ccv->flags &= ~CCF_TCPHDR_CWR; CC_ALGO(tp)->ecnpkt_handler(tp->ccv); if (tp->ccv->flags & CCF_ACKNOW) { tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); tp->t_flags |= TF_ACKNOW; } } } /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; m = *mp; if (m->m_len < *offp + sizeof(struct tcphdr)) { m = m_pullup(m, *offp + sizeof(struct tcphdr)); if (m == NULL) { *mp = m; TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); *mp = NULL; return (IPPROTO_DONE); } *mp = m; return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ int tcp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; uint8_t ipttl; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* * Be proactive about unspecified IPv6 address in source. 
* As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } - iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + iptos = IPV6_TRAFFIC_CLASS(ip6); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = off0 + tlen; ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { if (m->m_len < off0 + off) { m = m_pullup(m, off0 + off); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. 
Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK_ASSERT(inp); /* * While waiting for inp lock during the lookup, another thread * can have dropped the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. */ if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); inp = NULL; goto findpcb; } if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && ((inp->inp_socket == NULL) || (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. 
* * At this point, due to earlier optimism, we may hold only an inpcb * lock, and not the inpcbinfo write lock. If so, we need to try to * acquire it, or if that fails, acquire a reference on the inpcb, * drop all locks, acquire a global write lock, and then re-acquire * the inpcb lock. We may at that point discover that another thread * has tried to free the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. * * XXXRW: It may be time to rethink timewait locking. */ if (inp->inp_flags & INP_TIMEWAIT) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), ("%s: so accepting but tp %p not listening", __func__, tp)); if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ rstreason = syncache_expand(&inc, &to, th, &so, m); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. 
Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). */ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th, m); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. 
use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos)) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). 
*/ INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) INP_WUNLOCK(inp); drop: INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during 1/2 of an sRTT * is at least 3/8 of the current socket buffer size. * 3. receive buffer size has not hit maximal automatic size; * * If all of the criteria are met we increaset the socket buffer * by a 1/2 (bounded by the max). This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. * * This algorithm does two steps per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. 
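 * A worked example of the step-up above (numbers illustrative only):
 * with sb_hiwat at 64 KB the trigger is more than 3/8 * 64 KB = 24 KB
 * received within half an sRTT, after which the buffer grows to
 * min(64 KB + 32 KB, net.inet.tcp.recvbuf_max).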
*/ int tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int tlen) { int newsize = 0; if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) { if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else { tp->rfbuf_cnt += tlen; /* add up */ } return (newsize); } void tcp_handle_wakeup(struct tcpcb *tp, struct socket *so) { /* * Since tp might be gone if the session entered * the TIME_WAIT state before coming here, we need * to check if the socket is still connected. */ if ((so->so_state & SS_ISCONNECTED) == 0) return; INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_WAKESOR) { tp->t_flags &= ~TF_WAKESOR; SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); sorwakeup(so); } if (tp->t_flags & TF_WAKESOW) { tp->t_flags &= ~TF_WAKESOW; SOCKBUF_UNLOCK_ASSERT(&so->so_snd); sowwakeup(so); } } void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win, incforsyn = 0; uint32_t tiwin; uint16_t nsegs; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; int tfo_syn; u_int maxseg; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); #ifdef TCPPCAP /* Save segment, if requested. */ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } goto drop; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. 
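 * iptos carries the ECN codepoint from the IP header (for IPv6 it is now
 * taken via IPV6_TRAFFIC_CLASS()); the two low-order bits are:
 *
 *	IPTOS_ECN_NOTECT (0x00)	not ECN-capable
 *	IPTOS_ECN_ECT1   (0x01)	ECN-capable transport, ECT(1)
 *	IPTOS_ECN_ECT0   (0x02)	ECN-capable transport, ECT(0)
 *	IPTOS_ECN_CE     (0x03)	congestion experienced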
*/ if (tp->t_flags2 & TF2_ECN_PERMIT) { if (thflags & TH_CWR) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; tp->t_flags |= TF_ACKNOW; } switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; else if (tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { /* Handle parallel SYN for ECN */ if (!(thflags & TH_ACK) && ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { tp->t_flags2 |= TF2_ECN_PERMIT; tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_shs); } if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else tp->t_flags &= ~TF_REQ_SCALE; /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (!(to.to_flags & TOF_SACKPERM) || (tp->t_flags & TF_NOOPT))) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if ((to.to_flags & TOF_FASTOPEN) && !(tp->t_flags & TF_NOOPT)) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. 
*/ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } goto drop; } } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. 
*/ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->t_flags |= TF_WAKESOW; if (sbavail(&so->so_snd)) (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. 
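 * An acceptable ACK must satisfy SND.UNA < SEG.ACK <= SND.MAX;
 * anything outside that range cannot acknowledge data we actually
 * sent and is answered with a (rate limited) RST.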
*/ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && (V_tcp_do_ecn == 1)) { tp->t_flags2 |= TF2_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. 
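 * (The SYN itself consumes one sequence number, so the first data
 * byte is at SEG.SEQ + 1.)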
* If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. 
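 * The 24 day limit follows from the fastest timestamp clock RFC 7323
 * allows (1 ms per tick), which wraps half of the 32-bit timestamp
 * space in roughly 24.8 days; a ts_recent older than that can no
 * longer be compared reliably, so it is invalidated instead of being
 * used to drop live segments.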
*/ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. 
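 * The window-edge case is the peer's zero window probe: a segment
 * arriving at RCV.NXT while our advertised window is zero.  Its data
 * is dropped, but an ACK carrying the current window is forced so
 * the probe serves its purpose.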
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) incforsyn = 1; /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. 
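 * (A duplicate ACK here matches the RFC 5681 definition: no data,
 * SEG.ACK == SND.UNA, an unchanged advertised window and no new
 * FIN, while data is still outstanding.)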
If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) sack_changed = tcp_sack_doack(tp, &to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || ((tp->t_flags & TF_SACK_PERMIT) && !sack_changed)) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags) && (tp->t_flags & TF_SACK_PERMIT)) { int snd_cnt = 0, limit = 0; int del_data = 0, pipe = 0; /* * In a duplicate ACK del_data is only the * diff_in_sack. If no SACK is used del_data * will be 0. Pipe is the amount of data we * estimate to be in the network. 
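 * This implements RFC 6937 Proportional Rate Reduction.  While
 * pipe >= ssthresh the amount of new data clocked out is
 *	sndcnt = CEIL(prr_delivered * ssthresh / RecoverFS) - prr_out
 * where prr_out is the data already (re)transmitted during this
 * recovery; once pipe falls below ssthresh, the limit term lets the
 * window grow back toward ssthresh, slow-start like, or more
 * conservatively when V_tcp_do_prr_conservative is enabled.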
*/ del_data = tp->sackhint.delivered_data; if (V_tcp_do_rfc6675_pipe) pipe = tcp_compute_pipe(tp); else pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; tp->sackhint.prr_delivered += del_data; if (pipe >= tp->snd_ssthresh) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = imax(1, tp->snd_nxt - tp->snd_una); snd_cnt = howmany((long)tp->sackhint.prr_delivered * tp->snd_ssthresh, tp->sackhint.recover_fs) - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)); } else { if (V_tcp_do_prr_conservative) limit = tp->sackhint.prr_delivered - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)); else limit = imax(tp->sackhint.prr_delivered - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)), del_data) + maxseg; snd_cnt = imin(tp->snd_ssthresh - pipe, limit); } snd_cnt = imax(snd_cnt, 0) / maxseg; /* * Send snd_cnt new data into the network in * response to this ACK. If there is a going * to be a SACK retransmission, adjust snd_cwnd * accordingly. */ tp->snd_cwnd = imax(maxseg, tp->snd_nxt - tp->snd_recover + tp->sackhint.sack_bytes_rexmit + (snd_cnt * maxseg)); } else if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh || (tp->t_flags & TF_SACK_PERMIT && V_tcp_do_rfc6675_pipe && tp->sackhint.sacked_bytes > (tcprexmtthresh - 1) * maxseg)) { enter_recovery: /* * Above is the RFC6675 trigger condition of * more than (dupthresh-1)*maxseg sacked data. * If the count of holes in the * scoreboard is >= dupthresh, we could * also enter loss recovery, but don't * have that value readily available. */ tp->t_dupacks = tcprexmtthresh; tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, or prr, check * to see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (V_tcp_do_prr || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* * snd_ssthresh is already updated by * cc_cong_signal. 
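 * Initialize PRR state for this recovery episode: credit the bytes
 * already reported as SACKed, reset the recovery retransmit count
 * and record RecoverFS as the data outstanding when recovery began
 * (RFC 6937 uses it as the denominator of the proportional rate).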
*/ tp->sackhint.prr_delivered = tp->sackhint.sacked_bytes; tp->sackhint.sack_bytes_rexmit = 0; tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->snd_recover = tp->snd_nxt; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); if (SEQ_GT(th->th_ack, tp->snd_una)) goto resume_partialack; goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. */ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. The variable * sack_changed tracks all changes to the SACK * scoreboard, including when partial ACKs without * SACK options are received, and clear the scoreboard * from the left side. Such partial ACKs should not be * counted as dupacks here. */ if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACK) && sack_changed) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && (tp->sackhint.sacked_bytes > ((tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp))))) { goto enter_recovery; } } } resume_partialack: KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) if (V_tcp_do_prr && to.to_flags & TOF_SACK) tcp_prr_partialack(tp, th); else tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. 
*/ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Adjust for the SYN bit in sequence space, * but don't account for it in cwnd calculations. * This is for the SYN_RECEIVED, non-simultaneous * SYN case. SYN_SENT and simultaneous SYN are * treated elsewhere. */ if (incforsyn) tp->snd_una++; acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } SOCKBUF_UNLOCK(&so->so_snd); tp->t_flags |= TF_WAKESOW; m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? 
*/ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). 
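 * RFC 6093 later documented that essentially all implementations use
 * the "first octet past the urgent data" interpretation, updated the
 * specification accordingly, and deprecated new use of the urgent
 * mechanism altogether.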
*/ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. 
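 * The duplicate SACK (D-SACK, RFC 2883) blocks generated in these
 * branches report data received more than once back to the sender,
 * letting it tell spurious retransmissions apart from real loss.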
*/ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* The socket upcall is handled by socantrcvmore. */ tp->t_flags &= ~TF_WAKESOR; /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tcp_twstart(tp); return; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); check_delack: INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) { tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); } m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; if (th->th_flags & TH_FIN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. 
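 * EOL and NOP are single octets without a length field; every other
 * option is TLV encoded, so a length smaller than 2 or larger than
 * the remaining option space terminates parsing.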
*/ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_SIGNATURE: /* * In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have * to record the fact that the option was observed * here for the syncache code to perform the correct * response. */ if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; case TCPOPT_FAST_OPEN: /* * Cookie length validation is performed by the * server side cookie checking code or the client * side cookie cache update code. */ if (!(flags & TO_SYN)) continue; if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) continue; to->to_flags |= TOF_FASTOPEN; to->to_tfo_len = optlen - 2; to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tp->t_inpcb); TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt * 1000 / hz)); #endif if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). 
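 * In unscaled terms the code below computes
 *	delta = (rtt - 1) - srtt
 *	srtt += delta / 8
 * i.e. srtt_new = 7/8 * srtt_old + 1/8 * (rtt - 1); the shifts only
 * keep extra fraction bits so the small per-sample increments are
 * not lost to truncation.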
Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * NOTE that resulting t_maxseg doesn't include space for TCP options or * IP options, e.g. IPSEC data, since length of this data may vary, and * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; uint32_t maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxseg above. */ offer = tp->t_maxseg; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? 
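 * With the 64 byte floor below, a segment carrying the full 40 bytes
 * of TCP option space still leaves at least 24 bytes for payload,
 * since the per-segment option length is subtracted from t_maxseg in
 * tcp_output().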
*/ mss = max(mss, 64); tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int mss; uint32_t bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ tp->t_maxseg = max(mss, 64); SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; uint32_t thcmtu = 0; uint32_t maxmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } void tcp_prr_partialack(struct tcpcb *tp, struct tcphdr *th) { int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0; int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* * Compute the amount of data that this ACK is indicating * (del_data) and an estimate of how many bytes are in the * network. 
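 * Two pipe estimators are available: the RFC 6675 scoreboard based
 * tcp_compute_pipe(), or the cheaper snd_nxt - snd_fack plus the
 * SACK retransmit byte count, selected via V_tcp_do_rfc6675_pipe.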
*/ del_data = tp->sackhint.delivered_data; if (V_tcp_do_rfc6675_pipe) pipe = tcp_compute_pipe(tp); else pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; tp->sackhint.prr_delivered += del_data; /* * Proportional Rate Reduction */ if (pipe >= tp->snd_ssthresh) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = imax(1, tp->snd_nxt - tp->snd_una); snd_cnt = howmany((long)tp->sackhint.prr_delivered * tp->snd_ssthresh, tp->sackhint.recover_fs) - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)); } else { if (V_tcp_do_prr_conservative) limit = tp->sackhint.prr_delivered - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)); else limit = imax(tp->sackhint.prr_delivered - (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->snd_recover)), del_data) + maxseg; snd_cnt = imin((tp->snd_ssthresh - pipe), limit); } snd_cnt = imax(snd_cnt, 0) / maxseg; /* * Send snd_cnt new data into the network in response to this ack. * If there is going to be a SACK retransmission, adjust snd_cwnd * accordingly. */ tp->snd_cwnd = imax(maxseg, tp->snd_nxt - tp->snd_recover + tp->sackhint.sack_bytes_rexmit + (snd_cnt * maxseg)); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; uint32_t ocwnd = tp->snd_cwnd; u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += maxseg; } int tcp_compute_pipe(struct tcpcb *tp) { return (tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes); } uint32_t tcp_compute_initwnd(uint32_t maxseg) { /* * Calculate the Initial Window, also used as Restart Window * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * Support for user specified value for initial flight size. */ if (V_tcp_initcwnd_segments) return min(V_tcp_initcwnd_segments * maxseg, max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) return min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (maxseg > 2190) return (2 * maxseg); else if (maxseg > 1095) return (3 * maxseg); else return (4 * maxseg); } } diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index b4d64b8a6893..62f6ad57c0f5 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1,2106 +1,2106 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. 
* * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_UPDATE_CSUM 1 #ifndef TCP_LRO_UPDATE_CSUM #define TCP_LRO_INVALID_CSUM 0x0000 #endif static void tcp_lro_rx_done(struct lro_ctrl *lc); static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); static long tcplro_stacks_wanting_mbufq = 0; counter_u64_t tcp_inp_lro_direct_queue; counter_u64_t tcp_inp_lro_wokeup_queue; counter_u64_t tcp_inp_lro_compressed; counter_u64_t tcp_inp_lro_single_push; counter_u64_t tcp_inp_lro_locks_taken; counter_u64_t tcp_inp_lro_sack_wake; counter_u64_t tcp_extra_mbuf; counter_u64_t tcp_would_have_but; counter_u64_t tcp_comp_total; counter_u64_t tcp_uncomp_total; counter_u64_t tcp_csum_hardware; counter_u64_t tcp_csum_hardware_w_ph; counter_u64_t tcp_csum_software; static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, "default number of LRO entries"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD, &tcp_inp_lro_single_push, "Number of lro's sent with single segment"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, 
OID_AUTO, sackwakeups, CTLFLAG_RD, &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD, &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD, &tcp_would_have_but, "Number of times we would have had an extra compressed but out of room"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD, &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD, &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw, CTLFLAG_RD, &tcp_csum_hardware, "Number of checksums processed in hardware"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw_ph, CTLFLAG_RD, &tcp_csum_hardware_w_ph, "Number of checksums processed in hardware with pseudo header"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_sw, CTLFLAG_RD, &tcp_csum_software, "Number of checksums processed in software"); void tcp_lro_reg_mbufq(void) { atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); } void tcp_lro_dereg_mbufq(void) { atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); } static __inline void tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, struct lro_entry *le) { LIST_INSERT_HEAD(&lc->lro_active, le, next); LIST_INSERT_HEAD(bucket, le, hash_next); } static __inline void tcp_lro_active_remove(struct lro_entry *le) { LIST_REMOVE(le, next); /* active list */ LIST_REMOVE(le, hash_next); /* hash bucket */ } int tcp_lro_init(struct lro_ctrl *lc) { return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); } int tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, unsigned lro_entries, unsigned lro_mbufs) { struct lro_entry *le; size_t size; unsigned i, elements; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_mbuf_count = 0; lc->lro_mbuf_max = lro_mbufs; lc->lro_cnt = lro_entries; lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; lc->lro_length_lim = TCP_LRO_LENGTH_MAX; lc->ifp = ifp; LIST_INIT(&lc->lro_free); LIST_INIT(&lc->lro_active); /* create hash table to accelerate entry lookup */ if (lro_entries > lro_mbufs) elements = lro_entries; else elements = lro_mbufs; lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, HASH_NOWAIT); if (lc->lro_hash == NULL) { memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute size to allocate */ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + (lro_entries * sizeof(*le)); lc->lro_mbuf_data = (struct lro_mbuf_sort *) malloc(size, M_LRO, M_NOWAIT | M_ZERO); /* check for out of memory */ if (lc->lro_mbuf_data == NULL) { free(lc->lro_hash, M_LRO); memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute offset for LRO entries */ le = (struct lro_entry *) (lc->lro_mbuf_data + lro_mbufs); /* setup linked list */ for (i = 0; i != lro_entries; i++) LIST_INSERT_HEAD(&lc->lro_free, le + i, next); return (0); } static struct tcphdr * tcp_lro_get_th(struct lro_entry *le, struct mbuf *m) { struct ether_header *eh; struct tcphdr *th = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. 
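	 *
	 * Layout assumed by this helper (an informal sketch, not new
	 * logic): the frame is contiguous [ether | ip | tcp] with no IP
	 * options, so the TCP header is found by plain pointer arithmetic,
	 * e.g. for IPv4
	 *
	 *	th = (struct tcphdr *)((char *)eh + ETHER_HDR_LEN + sizeof(struct ip));
	 *
	 * which is what the (ip4 + 1) and (ip6 + 1) casts below amount to.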
*/ #endif eh = mtod(m, struct ether_header *); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(eh + 1); th = (struct tcphdr *)(ip6 + 1); break; #endif #ifdef INET case ETHERTYPE_IP: ip4 = (struct ip *)(eh + 1); th = (struct tcphdr *)(ip4 + 1); break; #endif } return (th); } static void lro_free_mbuf_chain(struct mbuf *m) { struct mbuf *save; while (m) { save = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = save; } } void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; unsigned x; /* reset LRO free list */ LIST_INIT(&lc->lro_free); /* free active mbufs, if any */ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); lro_free_mbuf_chain(le->m_head); } /* free hash table */ free(lc->lro_hash, M_LRO); lc->lro_hash = NULL; lc->lro_hashsz = 0; /* free mbuf array, if any */ for (x = 0; x != lc->lro_mbuf_count; x++) m_freem(lc->lro_mbuf_data[x].mb); lc->lro_mbuf_count = 0; /* free allocated memory, if any */ free(lc->lro_mbuf_data, M_LRO); lc->lro_mbuf_data = NULL; } static uint16_t tcp_lro_csum_th(struct tcphdr *th) { uint32_t ch; uint16_t *p, l; ch = th->th_sum = 0x0000; l = th->th_off; p = (uint16_t *)th; while (l > 0) { ch += *p; p++; ch += *p; p++; l--; } while (ch > 0xffff) ch = (ch >> 16) + (ch & 0xffff); return (ch & 0xffff); } static uint16_t tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, uint16_t tcp_data_len, uint16_t csum) { uint32_t c; uint16_t cs; c = csum; /* Remove length from checksum. */ switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)l3hdr; if (le->append_cnt == 0) cs = ip6->ip6_plen; else { uint32_t cx; cx = ntohs(ip6->ip6_plen); cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); } break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; ip4 = (struct ip *)l3hdr; if (le->append_cnt == 0) cs = ip4->ip_len; else { cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), IPPROTO_TCP); cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, htons(cs)); } break; } #endif default: cs = 0; /* Keep compiler happy. */ } cs = ~cs; c += cs; /* Remove TCP header csum. */ cs = ~tcp_lro_csum_th(th); c += cs; while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c & 0xffff); } static void tcp_lro_rx_done(struct lro_ctrl *lc) { struct lro_entry *le; while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; struct timeval tv; if (LIST_EMPTY(&lc->lro_active)) return; getmicrouptime(&tv); timevalsub(&tv, timeout); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (timevalcmp(&tv, &le->mtime, >=)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } #ifdef INET6 static int tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, struct tcphdr **th) { /* XXX-BZ we should check the flow-label. */ /* XXX-BZ We do not yet support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Find the TCP header. */ *th = (struct tcphdr *)(ip6 + 1); return (0); } #endif #ifdef INET static int tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, struct tcphdr **th) { int csum_flags; uint16_t csum; if (ip4->ip_p != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Ensure there are no options. */ if ((ip4->ip_hl << 2) != sizeof (*ip4)) return (TCP_LRO_CANNOT); /* .. and the packet is not fragmented. 
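	 *
	 * Both conditions are covered by the single test below: IP_MF
	 * marks "more fragments follow" and IP_OFFMASK extracts a non-zero
	 * fragment offset, so, informally,
	 *
	 *	(ip_off & htons(IP_MF | IP_OFFMASK)) != 0
	 *
	 * holds for the first fragment as well as for later ones, and such
	 * packets are handed back unmerged as TCP_LRO_CANNOT.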
*/ if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) return (TCP_LRO_CANNOT); /* Legacy IP has a header checksum that needs to be correct. */ csum_flags = m->m_pkthdr.csum_flags; if (csum_flags & CSUM_IP_CHECKED) { if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } else { csum = in_cksum_hdr(ip4); if (__predict_false((csum) != 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } /* Find the TCP header (we assured there are no IP options). */ *th = (struct tcphdr *)(ip4 + 1); return (0); } #endif #ifdef TCPHPTS static void tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len, uint32_t th_seq , uint32_t th_ack, uint16_t th_win) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; uint32_t cts; cts = tcp_get_usecs(&tv); memset(&log, 0, sizeof(union tcp_log_stackspecific)); log.u_bbr.flex8 = frm; log.u_bbr.flex1 = tcp_data_len; if (m) log.u_bbr.flex2 = m->m_pkthdr.len; else log.u_bbr.flex2 = 0; log.u_bbr.flex3 = le->append_cnt; log.u_bbr.flex4 = le->p_len; if (le->m_head) { log.u_bbr.flex5 = le->m_head->m_pkthdr.len; log.u_bbr.delRate = le->m_head->m_flags; log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp; } log.u_bbr.inflight = th_seq; log.u_bbr.timeStamp = cts; log.u_bbr.epoch = le->next_seq; log.u_bbr.delivered = th_ack; log.u_bbr.lt_epoch = le->ack_seq; log.u_bbr.pacing_gain = th_win; log.u_bbr.cwnd_gain = le->window; log.u_bbr.cur_del_rate = (uintptr_t)m; log.u_bbr.bw_inuse = (uintptr_t)le->m_head; log.u_bbr.pkts_out = le->mbuf_cnt; /* Total mbufs added */ log.u_bbr.applimited = le->ulp_csum; log.u_bbr.lost = le->mbuf_appended; log.u_bbr.pkt_epoch = le->cmp_ack_cnt; log.u_bbr.flex6 = tcp_tv_to_usectick(&lc->lro_last_flush); if (in_epoch(net_epoch_preempt)) log.u_bbr.inhpts = 1; else log.u_bbr.inhpts = 0; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_LRO, 0, 0, &log, false, &tv); } } #endif static void tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le) { if (le->append_cnt > 1) { struct tcphdr *th; uint16_t p_len; p_len = htons(le->p_len); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = le->le_ip6; ip6->ip6_plen = p_len; th = (struct tcphdr *)(ip6 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->p_len += ETHER_HDR_LEN + sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; uint32_t cl; uint16_t c; ip4 = le->le_ip4; /* Fix IP header checksum for new length. */ c = ~ip4->ip_sum; cl = c; c = ~ip4->ip_len; cl += c + p_len; while (cl > 0xffff) cl = (cl >> 16) + (cl & 0xffff); c = cl; ip4->ip_sum = ~c; ip4->ip_len = p_len; th = (struct tcphdr *)(ip4 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->p_len += ETHER_HDR_LEN; break; } #endif default: th = NULL; /* Keep compiler happy. */ } le->m_head->m_pkthdr.csum_data = 0xffff; le->m_head->m_pkthdr.len = le->p_len; /* Incorporate the latest ACK into the TCP header. */ th->th_ack = le->ack_seq; th->th_win = le->window; /* Incorporate latest timestamp into the TCP header. */ if (le->timestamp != 0) { uint32_t *ts_ptr; ts_ptr = (uint32_t *)(th + 1); ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } /* Update the TCP header checksum. 
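	 *
	 * What follows is plain ones-complement bookkeeping (sketched here
	 * for readability, the code below is authoritative): fold the new
	 * total length and the rewritten TCP header into the running sum,
	 * do the end-around-carry reduction
	 *
	 *	while (sum > 0xffff)
	 *		sum = (sum >> 16) + (sum & 0xffff);
	 *
	 * and store the one's complement of the 16-bit result as th_sum.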
*/ le->ulp_csum += p_len; le->ulp_csum += tcp_lro_csum_th(th); while (le->ulp_csum > 0xffff) le->ulp_csum = (le->ulp_csum >> 16) + (le->ulp_csum & 0xffff); th->th_sum = (le->ulp_csum & 0xffff); th->th_sum = ~th->th_sum; } /* * Break any chain, this is not set to NULL on the singleton * case m_nextpkt points to m_head. Other case set them * m_nextpkt to NULL in push_and_replace. */ le->m_head->m_nextpkt = NULL; le->m_head->m_pkthdr.lro_nsegs = le->append_cnt; (*lc->ifp->if_input)(lc->ifp, le->m_head); lc->lro_queued += le->append_cnt; } static void tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) { struct ether_header *eh; void *l3hdr = NULL; /* Keep compiler happy. */ struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. */ #endif uint32_t *ts_ptr; int error, l, ts_failed = 0; uint16_t tcp_data_len; uint16_t csum; error = -1; eh = mtod(m, struct ether_header *); /* * We must reset the other pointers since the mbuf * we were pointing too is about to go away. */ switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); error = tcp_lro_rx_ipv6(lc, m, ip6, &th); le->le_ip6 = ip6; le->source_ip6 = ip6->ip6_src; le->dest_ip6 = ip6->ip6_dst; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); break; #endif #ifdef INET case ETHERTYPE_IP: l3hdr = ip4 = (struct ip *)(eh + 1); error = tcp_lro_rx_ipv4(lc, m, ip4, &th); le->le_ip4 = ip4; le->source_ip4 = ip4->ip_src.s_addr; le->dest_ip4 = ip4->ip_dst.s_addr; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif } KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n", __func__, le)); ts_ptr = (uint32_t *)(th + 1); l = (th->th_off << 2); l -= sizeof(*th); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { /* We have failed to find a timestamp some other option? */ ts_failed = 1; } if ((l != 0) && (ts_failed == 0)) { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } else le->timestamp = 0; le->source_port = th->th_sport; le->dest_port = th->th_dport; /* Pull out the csum */ tcp_data_len = m->m_pkthdr.lro_len; le->next_seq = ntohl(th->th_seq) + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; csum = th->th_sum; /* Setup the data pointers */ le->m_head = m; le->m_tail = m_last(m); le->append_cnt = 0; le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); le->append_cnt++; th->th_sum = csum; /* Restore checksum on first packet. */ } static void tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) { /* * Push up the stack the current le and replace * it with m. */ struct mbuf *msave; /* Grab off the next and save it */ msave = le->m_head->m_nextpkt; le->m_head->m_nextpkt = NULL; /* Now push out the old le entry */ tcp_flush_out_le(tp, lc, le); /* * Now to replace the data properly in the le * we have to reset the tcp header and * other fields. */ tcp_set_le_to_m(lc, le, m); /* Restore the next list */ m->m_nextpkt = msave; } static void tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le) { /* * Walk through the mbuf chain we * have on tap and compress/condense * as required. 
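	 *
	 * Informally, the loop below does the following for every queued
	 * mbuf (a summary sketch only):
	 *
	 *	1. anything that is not a plain ACK/PUSH segment, or that
	 *	   carries an option other than the well-known timestamp
	 *	   layout, forces a push_and_replace so it is delivered on
	 *	   its own;
	 *	2. in-order data is appended to the current entry until the
	 *	   configured length or ack-count limits would be exceeded;
	 *	3. pure ACKs that only advance th_ack/th_win are absorbed
	 *	   into the entry's header and the mbuf is freed.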
*/ uint32_t *ts_ptr; struct mbuf *m; struct tcphdr *th; uint16_t tcp_data_len, csum_upd; int l; /* * First we must check the lead (m_head) * we must make sure that it is *not* * something that should be sent up * right away (sack etc). */ again: m = le->m_head->m_nextpkt; if (m == NULL) { /* Just the one left */ return; } if (m->m_flags & M_ACKCMP) panic("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp", lc, le, m); th = tcp_lro_get_th(le, le->m_head); KASSERT(th != NULL, ("le:%p m:%p th comes back NULL?", le, le->m_head)); l = (th->th_off << 2); l -= sizeof(*th); ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { /* * Its not the timestamp. We can't * use this guy as the head. */ le->m_head->m_nextpkt = m->m_nextpkt; tcp_push_and_replace(tp, lc, le, m); goto again; } if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { /* * Make sure that previously seen segements/ACKs are delivered * before this segment, e.g. FIN. */ le->m_head->m_nextpkt = m->m_nextpkt; KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0) , ("tp:%p mbuf:%p has stripped ethernet flags:0x%x", tp, m, m->m_flags)); tcp_push_and_replace(tp, lc, le, m); goto again; } while((m = le->m_head->m_nextpkt) != NULL) { /* * condense m into le, first * pull m out of the list. */ KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0) , ("tp:%p mbuf:%p has stripped ethernet flags:0x%x", tp, m, m->m_flags)); KASSERT(((m->m_flags & M_ACKCMP) == 0), ("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp", lc, le, m)); le->m_head->m_nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; /* Setup my data */ tcp_data_len = m->m_pkthdr.lro_len; th = tcp_lro_get_th(le, m); KASSERT(th != NULL, ("le:%p m:%p th comes back NULL?", le, m)); ts_ptr = (uint32_t *)(th + 1); l = (th->th_off << 2); l -= sizeof(*th); if (le->append_cnt >= lc->lro_ackcnt_lim) { tcp_push_and_replace(tp, lc, le, m); goto again; } if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { /* Flush now if appending will result in overflow. */ tcp_push_and_replace(tp, lc, le, m); goto again; } if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { /* * Maybe a sack in the new one? We need to * start all over after flushing the * current le. We will go up to the beginning * and flush it (calling the replace again possibly * or just returning). */ tcp_push_and_replace(tp, lc, le, m); goto again; } if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { tcp_push_and_replace(tp, lc, le, m); goto again; } if (l != 0) { uint32_t tsval = ntohl(*(ts_ptr + 1)); /* Make sure timestamp values are increasing. */ if (TSTMP_GT(le->tsval, tsval)) { tcp_push_and_replace(tp, lc, le, m); goto again; } le->tsval = tsval; le->tsecr = *(ts_ptr + 2); } /* Try to append the new segment. */ if (__predict_false(ntohl(th->th_seq) != le->next_seq || (tcp_data_len == 0 && le->ack_seq == th->th_ack && le->window == th->th_win))) { /* Out of order packet or duplicate ACK. 
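	 *
	 * i.e. the segment is only merged when, roughly,
	 *
	 *	ntohl(th->th_seq) == le->next_seq &&
	 *	!(tcp_data_len == 0 && th->th_ack == le->ack_seq &&
	 *	  th->th_win == le->window)
	 *
	 * anything else (a hole in the sequence space, or a pure ACK that
	 * repeats the previous ACK and window verbatim) is pushed up on
	 * its own so the stack can react to it.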
*/ tcp_push_and_replace(tp, lc, le, m); goto again; } if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { le->next_seq += tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; } else if (th->th_ack == le->ack_seq) { le->window = WIN_MAX(le->window, th->th_win); } csum_upd = m->m_pkthdr.lro_csum; le->ulp_csum += csum_upd; if (tcp_data_len == 0) { le->append_cnt++; le->mbuf_cnt--; m_freem(m); continue; } le->append_cnt++; le->mbuf_appended++; le->p_len += tcp_data_len; /* * Adjust the mbuf so that m_data points to the first byte of * the ULP payload. Adjust the mbuf to avoid complications and * append new segment to existing mbuf chain. */ m_adj(m, m->m_pkthdr.len - tcp_data_len); m_demote_pkthdr(m); le->m_tail->m_next = m; le->m_tail = m_last(m); } } #ifdef TCPHPTS static void tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le) { if (tp->t_in_pkt == NULL) { /* Nothing yet there */ tp->t_in_pkt = le->m_head; tp->t_tail_pkt = le->m_last_mbuf; } else { /* Already some there */ tp->t_tail_pkt->m_nextpkt = le->m_head; tp->t_tail_pkt = le->m_last_mbuf; } le->m_head = NULL; le->m_last_mbuf = NULL; } static struct mbuf * tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, struct inpcb *inp) { struct mbuf *m = NULL; struct tcpcb *tp; tp = intotcpcb(inp); if (tp) { /* Look at the last mbuf if any in queue */ if ((tp->t_tail_pkt) && (tp->t_tail_pkt->m_flags & M_ACKCMP)) { if (M_TRAILINGSPACE(tp->t_tail_pkt) >= sizeof(struct tcp_ackent)) { tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0); m = tp->t_tail_pkt; } else { if ((inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) { counter_u64_add(tcp_would_have_but, 1); inp->inp_flags2 |= INP_MBUF_L_ACKS; } } } } return (m); } static struct inpcb * tcp_lro_lookup(struct lro_ctrl *lc, struct lro_entry *le) { struct inpcb *inp = NULL; NET_EPOCH_ASSERT(); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6, le->source_port, &le->dest_ip6,le->dest_port, INPLOOKUP_WLOCKPCB, lc->ifp); break; #endif #ifdef INET case ETHERTYPE_IP: inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src, le->source_port, le->le_ip4->ip_dst, le->dest_port, INPLOOKUP_WLOCKPCB, lc->ifp); break; #endif } return (inp); } #endif #ifdef NO static void stack_guard_prep(uint32_t *sg, int len) { int i; for (i = 0; i < len; i++) { sg[i] = 0xdeadc0de; } } static void stack_guard_check(struct lro_ctrl *lc, struct lro_entry *le, uint32_t *sg, int len) { int i; for (i = 0; i < len; i++) { if (sg[i] != 0xdeadc0de) panic("Stack guard fails sg[%d] = 0x%x le:%p lc:%p sg:%p\n", i, sg[i], le, lc, sg); } } #endif void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { struct tcpcb *tp = NULL; #ifdef TCPHPTS struct inpcb *inp = NULL; int need_wakeup = 0, can_queue = 0; /* Now lets lookup the inp first */ CURVNET_SET(lc->ifp->if_vnet); /* * XXXRRS Currently the common input handler for * mbuf queuing cannot handle VLAN Tagged. This needs * to be fixed and the or condition removed (i.e. the * common code should do the right lookup for the vlan * tag and anything else that the vlan_input() does). */ if (le->m_head == NULL) { /* * Everything was pushed up to the stack nothing to do * but release the reference and be done. */ if (le->inp) { INP_WLOCK(le->inp); if (in_pcbrele_wlocked(le->inp) == 0) { /* * We released it and still * have the lock. 
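	 *
	 * (in_pcbrele_wlocked() returns zero when it merely dropped our
	 * reference and the write lock is still held, hence the explicit
	 * unlock below; a non-zero return means the last reference went
	 * away and the inp, lock included, is already gone.)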
*/ INP_WUNLOCK(le->inp); } } goto done; } if ((tcplro_stacks_wanting_mbufq == 0) || (le->m_head->m_flags & M_VLANTAG)) goto skip_lookup; if (le->inp == NULL) { le->inp = inp = tcp_lro_lookup(lc, le); if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || (inp->inp_flags2 & INP_FREED))) { /* * We can't present these to the inp since * it will not support the stripped ethernet * header that these have nor if a compressed * ack is presnet. */ INP_WUNLOCK(inp); lro_free_mbuf_chain(le->m_head); goto done; } if ((le->flags & HAS_COMP_ENTRIES) && ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) { /* * It swapped to off, must be a stack * switch. We need to ditch all the packets * and the peer will just have to retransmit. */ INP_WUNLOCK(inp); lro_free_mbuf_chain(le->m_head); goto done; } } else { /* We have a reference on the inp lets lock and release it */ inp = le->inp; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { /* * We lost the inp. We can't present these to the inp since * it will not support the stripped off etherent header. */ lro_free_mbuf_chain(le->m_head); goto done; } if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || (inp->inp_flags2 & INP_FREED))) { /* * We can't present these to the inp since * it may not support them. */ INP_WUNLOCK(inp); lro_free_mbuf_chain(le->m_head); goto done; } if ((le->flags & HAS_COMP_ENTRIES) && ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) { /* * It swapped to off, must be a stack * switch. We need to ditch all the packets * and the peer will just have to retransmit. */ INP_WUNLOCK(inp); lro_free_mbuf_chain(le->m_head); goto done; } } if (inp && ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) || (inp->inp_flags2 & INP_MBUF_ACKCMP))) { /* The transport supports mbuf queuing */ can_queue = 1; if (le->need_wakeup || ((inp->inp_in_input == 0) && ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) { /* * Either the transport is off on a keep-alive * (it has the queue_ready flag clear and its * not already been woken) or the entry has * some urgent thing (FIN or possibly SACK blocks). * This means we need to wake the transport up by * putting it on the input pacer. */ need_wakeup = 1; if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) && (le->need_wakeup != 1)) { /* * Prohibited from a sack wakeup. */ need_wakeup = 0; } } /* Do we need to be awoken due to lots of data or acks? */ if ((le->tcp_tot_p_len >= lc->lro_length_lim) || (le->mbuf_cnt >= lc->lro_ackcnt_lim)) need_wakeup = 1; } if (inp) tp = intotcpcb(inp); else tp = NULL; if (can_queue) { counter_u64_add(tcp_inp_lro_direct_queue, 1); tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup, inp->inp_flags2, inp->inp_in_input, le->need_wakeup); tcp_queue_pkts(tp, le); if (need_wakeup) { /* * We must get the guy to wakeup via * hpts. */ NET_EPOCH_ASSERT(); if (le->need_wakeup == 2) { /* * The value 2 is set if the * options are unrecognized i.e. * not just a timestamp. So really * sack is usually what it is but * it might be some other option (CWR * etc). */ counter_u64_add(tcp_inp_lro_sack_wake, 1); } counter_u64_add(tcp_inp_lro_wokeup_queue, 1); if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0)) { inp = NULL; } } } if (inp) { /* Unlock it */ tp = NULL; counter_u64_add(tcp_inp_lro_locks_taken, 1); INP_WUNLOCK(inp); } if (can_queue == 0) { skip_lookup: if (le->strip_cnt) { /* * We have stripped mbufs, the connection * must have changed underneath us. You * loose the packets as a penalty. 
*/ lro_free_mbuf_chain(le->m_head); goto done; } #endif /* TCPHPTS */ /* Old fashioned lro method */ if (le->m_head != le->m_last_mbuf) { counter_u64_add(tcp_inp_lro_compressed, 1); tcp_lro_condense(tp, lc, le); } else counter_u64_add(tcp_inp_lro_single_push, 1); tcp_flush_out_le(tp, lc, le); #ifdef TCPHPTS } done: CURVNET_RESTORE(); #endif lc->lro_flushed++; bzero(le, sizeof(*le)); LIST_INSERT_HEAD(&lc->lro_free, le, next); } #ifdef HAVE_INLINE_FLSLL #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) #else static inline uint64_t tcp_lro_msb_64(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); return (x & ~(x >> 1)); } #endif /* * The tcp_lro_sort() routine is comparable to qsort(), except it has * a worst case complexity limit of O(MIN(N,64)*N), where N is the * number of elements to sort and 64 is the number of sequence bits * available. The algorithm is bit-slicing the 64-bit sequence number, * sorting one bit at a time from the most significant bit until the * least significant one, skipping the constant bits. This is * typically called a radix sort. */ static void tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) { struct lro_mbuf_sort temp; uint64_t ones; uint64_t zeros; uint32_t x; uint32_t y; repeat: /* for small arrays insertion sort is faster */ if (size <= 12) { for (x = 1; x < size; x++) { temp = parray[x]; for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) parray[y] = parray[y - 1]; parray[y] = temp; } return; } /* compute sequence bits which are constant */ ones = 0; zeros = 0; for (x = 0; x != size; x++) { ones |= parray[x].seq; zeros |= ~parray[x].seq; } /* compute bits which are not constant into "ones" */ ones &= zeros; if (ones == 0) return; /* pick the most significant bit which is not constant */ ones = tcp_lro_msb_64(ones); /* * Move entries having cleared sequence bits to the beginning * of the array: */ for (x = y = 0; y != size; y++) { /* skip set bits */ if (parray[y].seq & ones) continue; /* swap entries */ temp = parray[x]; parray[x] = parray[y]; parray[y] = temp; x++; } KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); /* sort zeros */ tcp_lro_sort(parray, x); /* sort ones */ parray += x; size -= x; goto repeat; } void tcp_lro_flush_all(struct lro_ctrl *lc) { uint64_t seq; uint64_t nseq; unsigned x; /* check if no mbufs to flush */ if (lc->lro_mbuf_count == 0) goto done; microuptime(&lc->lro_last_flush); /* sort all mbufs according to stream */ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); /* input data into LRO engine, stream by stream */ seq = 0; for (x = 0; x != lc->lro_mbuf_count; x++) { struct mbuf *mb; /* get mbuf */ mb = lc->lro_mbuf_data[x].mb; /* get sequence number, masking away the packet index */ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); /* check for new stream */ if (seq != nseq) { seq = nseq; /* flush active streams */ tcp_lro_rx_done(lc); } /* add packet to LRO engine */ if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { /* input packet to network layer */ (*lc->ifp->if_input)(lc->ifp, mb); lc->lro_queued++; lc->lro_flushed++; } } done: /* flush active streams */ tcp_lro_rx_done(lc); lc->lro_mbuf_count = 0; } static void lro_set_mtime(struct timeval *tv, struct timespec *ts) { tv->tv_sec = ts->tv_sec; tv->tv_usec = ts->tv_nsec / 1000; } #ifdef TCPHPTS static void build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m, uint16_t hdr_len, uint16_t iptos) { /* * Given a TCP ack, summarize it down into the small tcp * ack entry. 
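	 *
	 * Roughly speaking, each struct tcp_ackent records just the fields
	 * a pure ACK can change:
	 *
	 *	ae->seq / ae->ack / ae->win	(host byte order)
	 *	ae->flags			(TCP flags plus TSTMP_*, HAS_TSTMP)
	 *	ae->ts_value / ae->ts_echo	(when a timestamp option is present)
	 *	ae->timestamp / ae->codepoint	(arrival time and IP TOS/traffic class)
	 *
	 * which is what the assignments below fill in.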
*/ u_char *cp; KASSERT(((th->th_flags & ~(TH_ACK | TH_PUSH | TH_CWR | TH_ECE)) == 0), ("tcphdr:%p mbuf:%p has unallowed bits %x", th, m, th->th_flags)); ae->timestamp = m->m_pkthdr.rcv_tstmp; if (m->m_flags & M_TSTMP_LRO) ae->flags = TSTMP_LRO; else if (m->m_flags & M_TSTMP) ae->flags = TSTMP_HDWR; ae->seq = ntohl(th->th_seq); ae->ack = ntohl(th->th_ack); ae->flags |= th->th_flags; if (hdr_len) { /* We have a timestamp options get out the bits */ cp = (u_char *)(th + 1); /* Skip the two NOP's at the front */ while (*cp == TCPOPT_NOP) cp++; KASSERT(((*cp == TCPOPT_TIMESTAMP) && (cp[1] == TCPOLEN_TIMESTAMP)), ("At %p in tcphdr:%p options of %d not timestamp", cp, th, hdr_len)); bcopy((char *)cp + 2, (char *)&ae->ts_value, sizeof(uint32_t)); ae->ts_value = ntohl(ae->ts_value); bcopy((char *)cp + 6, (char *)&ae->ts_echo, sizeof(uint32_t)); ae->ts_echo = ntohl(ae->ts_echo); ae->flags |= HAS_TSTMP; } ae->win = ntohs(th->th_win); ae->codepoint = iptos; } static struct mbuf * do_bpf_and_csum(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le, struct ether_header *eh, struct mbuf *m, int bpf_req, int locked) { /* * Do TCP/IP checksum and BPF tap for either ACK_CMP packets or * MBUF QUEUE type packets. */ struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip = NULL; /* Keep compiler happy. */ #endif uint16_t drop_hdrlen; int etype, tlen; #ifdef INET uint8_t iptos; #endif /* Let the BPF see the packet */ if (bpf_req && lc->ifp) ETHER_BPF_MTAP(lc->ifp, m); /* Get type and Trim off the ethernet header */ m->m_pkthdr.lro_etype = etype = ntohs(eh->ether_type); m_adj(m, sizeof(*eh)); m->m_flags |= M_LRO_EHDRSTRP; switch (etype) { #ifdef INET6 case ETHERTYPE_IPV6: { if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); m_freem(m); return (NULL); } } ip6 = (struct ip6_hdr *)(eh + 1); th = (struct tcphdr *)(ip6 + 1); tlen = ntohs(ip6->ip6_plen); drop_hdrlen = sizeof(*ip6); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { counter_u64_add(tcp_csum_hardware_w_ph, 1); th->th_sum = m->m_pkthdr.csum_data; } else { counter_u64_add(tcp_csum_hardware, 1); th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); } th->th_sum ^= 0xffff; } else { counter_u64_add(tcp_csum_software, 1); th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); if (locked) { /* Log the bad news */ struct tcpcb *tp = intotcpcb(inp); tcp_lro_log(tp, lc, le, m, 13, tlen, m->m_pkthdr.csum_flags, drop_hdrlen, th->th_sum); } m_freem(m); return (NULL); } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ m_freem(m); return (NULL); } break; } #endif #ifdef INET case ETHERTYPE_IP: { if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); m_freem(m); return (NULL); } } ip = (struct ip *)(eh + 1); th = (struct tcphdr *)(ip + 1); iptos = ip->ip_tos; drop_hdrlen = sizeof(*ip); tlen = ntohs(ip->ip_len) - sizeof(struct ip); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { counter_u64_add(tcp_csum_hardware_w_ph, 1); th->th_sum = m->m_pkthdr.csum_data; } else { counter_u64_add(tcp_csum_hardware, 1); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); } th->th_sum ^= 0xffff; } else { int len; struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ counter_u64_add(tcp_csum_software, 1); len = drop_hdrlen + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); if (locked) { /* Log the bad news */ struct tcpcb *tp = intotcpcb(inp); tcp_lro_log(tp, lc, le, m, 13, tlen, m->m_pkthdr.csum_flags, drop_hdrlen, th->th_sum); } m_freem(m); return (NULL); } break; } #endif } /* end switch */ return (m); } #endif static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) { struct lro_entry *le; struct ether_header *eh; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. */ #endif struct tcphdr *th; void *l3hdr = NULL; /* Keep compiler happy. */ uint32_t *ts_ptr; tcp_seq seq; int error, ip_len, hdr_len, locked = 0; uint16_t eh_type, tcp_data_len, need_flush; #ifdef TCPHPTS uint16_t iptos; #endif struct lro_head *bucket; struct timespec arrv; /* Clear the flags we may use to communicate with TCP */ m->m_flags &= ~(M_ACKCMP|M_LRO_EHDRSTRP); /* We expect a contiguous header [eh, ip, tcp]. */ if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) { /* If no hardware or arrival stamp on the packet add arrival */ nanouptime(&arrv); m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec; m->m_flags |= M_TSTMP_LRO; } eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { CURVNET_SET(lc->ifp->if_vnet); if (V_ip6_forwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); error = tcp_lro_rx_ipv6(lc, m, ip6, &th); if (error != 0) return (error); tcp_data_len = ntohs(ip6->ip6_plen); #ifdef TCPHPTS - iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + iptos = IPV6_TRAFFIC_CLASS(ip6); #endif ip_len = sizeof(*ip6) + tcp_data_len; break; } #endif #ifdef INET case ETHERTYPE_IP: { CURVNET_SET(lc->ifp->if_vnet); if (V_ipforwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. 
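	 *
	 * (The check below mirrors the IPv6 case above: when the host may
	 * be forwarding, frames are handed back unmerged, since coalescing
	 * them here would change what goes back out on the wire.)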
*/ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip4 = (struct ip *)(eh + 1); error = tcp_lro_rx_ipv4(lc, m, ip4, &th); if (error != 0) return (error); ip_len = ntohs(ip4->ip_len); #ifdef TCPHPTS iptos = ip4->ip_tos; #endif tcp_data_len = ip_len - sizeof(*ip4); break; } #endif /* XXX-BZ what happens in case of VLAN(s)? */ default: return (TCP_LRO_NOT_SUPPORTED); } /* * If the frame is padded beyond the end of the IP packet, then we must * trim the extra bytes off. */ hdr_len = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); if (hdr_len != 0) { if (hdr_len < 0) /* Truncated packet. */ return (TCP_LRO_CANNOT); m_adj(m, -hdr_len); } /* * Check TCP header constraints. */ hdr_len = (th->th_off << 2); ts_ptr = (uint32_t *)(th + 1); tcp_data_len -= hdr_len; hdr_len -= sizeof(*th); if (th->th_flags & TH_SYN) return (TCP_LRO_CANNOT); if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { need_flush = 1; } else need_flush = 0; if (hdr_len != 0 && (__predict_false(hdr_len != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { /* * We have an option besides Timestamps, maybe * it is a sack (most likely) which means we * will probably need to wake up a sleeper (if * the guy does queueing). */ need_flush = 2; } /* If the driver did not pass in the checksum, set it now. */ if (csum == 0x0000) csum = th->th_sum; seq = ntohl(th->th_seq); if (!use_hash) { bucket = &lc->lro_hash[0]; } else if (M_HASHTYPE_ISHASH(m)) { bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; } else { uint32_t hash; switch (eh_type) { #ifdef INET case ETHERTYPE_IP: hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: hash = ip6->ip6_src.s6_addr32[0] + ip6->ip6_dst.s6_addr32[0]; hash += ip6->ip6_src.s6_addr32[1] + ip6->ip6_dst.s6_addr32[1]; hash += ip6->ip6_src.s6_addr32[2] + ip6->ip6_dst.s6_addr32[2]; hash += ip6->ip6_src.s6_addr32[3] + ip6->ip6_dst.s6_addr32[3]; break; #endif default: hash = 0; break; } hash += th->th_sport + th->th_dport; bucket = &lc->lro_hash[hash % lc->lro_hashsz]; } /* Try to find a matching previous segment. */ LIST_FOREACH(le, bucket, hash_next) { if (le->eh_type != eh_type) continue; if (le->source_port != th->th_sport || le->dest_port != th->th_dport) continue; switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: if (bcmp(&le->source_ip6, &ip6->ip6_src, sizeof(struct in6_addr)) != 0 || bcmp(&le->dest_ip6, &ip6->ip6_dst, sizeof(struct in6_addr)) != 0) continue; break; #endif #ifdef INET case ETHERTYPE_IP: if (le->source_ip4 != ip4->ip_src.s_addr || le->dest_ip4 != ip4->ip_dst.s_addr) continue; break; #endif } if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) || (th->th_ack == le->ack_seq)) { m->m_pkthdr.lro_len = tcp_data_len; } else { /* no data and old ack */ m_freem(m); return (0); } #ifdef TCPHPTS if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG)) goto skip_lookup_a; if (le->inp == NULL) { CURVNET_SET(lc->ifp->if_vnet); le->inp = tcp_lro_lookup(lc, le); if (le->inp) { in_pcbref(le->inp); locked = 1; } CURVNET_RESTORE(); } else if (le->inp) { INP_WLOCK(le->inp); locked = 1; } if (locked && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || (le->inp->inp_flags2 & INP_FREED))) { /* We can't present these to the inp since * its dead Jim. 
*/ int ret; ret = in_pcbrele_wlocked(le->inp); if (ret == 0) INP_WUNLOCK(le->inp); le->inp = NULL; locked = 0; tcp_lro_active_remove(le); if (le->strip_cnt && le->m_head) { /* * If we have any stripped packets we * just dump the whole chain. The * tcp_lro_flush code knows how * to handle things when le->m_head is NULL * and even le->inp is NULL. */ lro_free_mbuf_chain(le->m_head); le->m_head = NULL; } tcp_lro_flush(lc, le); return (TCP_LRO_CANNOT); } /* See if it has been switched on */ if (le->inp && (le->inp->inp_flags2 & INP_MBUF_ACKCMP)) le->flags |= CAN_USE_ACKCMP; if ((need_flush == 1) && le->inp && (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) && ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) { /* * For MBUF queuing or ACKCMP we can accept ECE and CWR * since each packet is sent to the transport (or the * compressed state including the ECN bits). */ need_flush = 0; } skip_lookup_a: #endif if (need_flush) le->need_wakeup = need_flush; /* Save of the data only csum */ m->m_pkthdr.rcvif = lc->ifp; m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); th->th_sum = csum; /* Restore checksum */ #ifdef TCPHPTS if ((le->flags & CAN_USE_ACKCMP) || (le->inp && (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) { /* * Mbuf queued and ACKCMP packets have their BPF and csum * done here in LRO. They will still end up looking at the * headers and such (IP/TCP) but we don't want to proceed * with any bad csum! */ m = do_bpf_and_csum(le->inp, lc, le, eh, m, bpf_peers_present(lc->ifp->if_bpf), locked); if (m == NULL) { /* Bad csum, accounting already done */ if (locked) { INP_WUNLOCK(le->inp); } return (0); } le->strip_cnt++; } if ((need_flush == 0) && (th->th_flags & TH_ACK) && (tcp_data_len == 0) && (le->flags & CAN_USE_ACKCMP)) { /* * Ok this is a pure ack lets find out if our * last packet already has one of these. */ struct mbuf *nm; struct tcp_ackent *ack_ent; int idx; INP_WLOCK_ASSERT(le->inp); if (le->m_head == NULL) { /* Ok can we still use the end of the inp's? 
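	 *
	 * (A sketch of the append that follows: the tail mbuf is treated
	 * as a flat array of struct tcp_ackent records, so roughly
	 *
	 *	idx = nm->m_len / sizeof(struct tcp_ackent);
	 *	build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos);
	 *	nm->m_len += sizeof(struct tcp_ackent);
	 *	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
	 *
	 * and the summarized ACK becomes one more element of that array.)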
*/ nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp); if (nm == NULL) { /* gone or full */ goto new_one; } /* We can add in to the one on the tail */ ack_ent = mtod(nm, struct tcp_ackent *); idx = (nm->m_len / sizeof(struct tcp_ackent)); build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); /* Bump the size of both pkt-hdr and len */ nm->m_len += sizeof(struct tcp_ackent); nm->m_pkthdr.len += sizeof(struct tcp_ackent); le->ack_seq = th->th_ack; le->window = th->th_win; m_freem(m); counter_u64_add(tcp_extra_mbuf, 1); INP_WUNLOCK(le->inp); return (0); } else if (le->m_last_mbuf->m_flags & M_ACKCMP) { /* Yes we might be able to be appended to */ nm = le->m_last_mbuf; if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) { if ((le->inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) { counter_u64_add(tcp_would_have_but, 1); le->inp->inp_flags2 |= INP_MBUF_L_ACKS; } goto new_one; } /* we have room */ ack_ent = mtod(nm, struct tcp_ackent *); idx = (nm->m_len / sizeof(struct tcp_ackent)); build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); /* Bump the size of both pkt-hdr and len */ nm->m_len += sizeof(struct tcp_ackent); nm->m_pkthdr.len += sizeof(struct tcp_ackent); m_freem(m); le->flags |= HAS_COMP_ENTRIES; le->cmp_ack_cnt++; goto compressed; } else { /* Nope we need a new one */ new_one: if (le->inp->inp_flags2 & INP_MBUF_L_ACKS) nm = m_getcl(M_NOWAIT, MT_DATA, (M_ACKCMP|M_PKTHDR)); else { nm = m_gethdr(M_NOWAIT, MT_DATA); nm->m_flags |= M_ACKCMP; } if (nm) { nm->m_pkthdr.rcvif = lc->ifp; ack_ent = mtod(nm, struct tcp_ackent *); build_ack_entry(ack_ent, th, m, hdr_len, iptos); m_freem(m); m = nm; m->m_pkthdr.len = m->m_len = sizeof(struct tcp_ackent); le->flags |= HAS_COMP_ENTRIES; le->cmp_ack_cnt++; } /* We fall through and append */ } } if (m->m_flags & M_ACKCMP) { counter_u64_add(tcp_comp_total, 1); } else { counter_u64_add(tcp_uncomp_total, 1); } #endif /* Save off the tail I am appending too (prev) */ m->m_nextpkt = NULL; if (le->m_head == NULL) { /* * Case where we wer chaining off the inp * and now no-longer can. */ le->m_head = m; le->m_tail = m_last(m); le->m_last_mbuf = m; le->m_prev_last = NULL; } else { le->m_prev_last = le->m_last_mbuf; /* Mark me in the last spot */ le->m_last_mbuf->m_nextpkt = m; /* Now set the tail to me */ le->m_last_mbuf = m; le->tcp_tot_p_len += tcp_data_len; } #ifdef TCPHPTS compressed: #endif le->mbuf_cnt++; /* Add to the total size of data */ lro_set_mtime(&le->mtime, &arrv); if (locked) INP_WUNLOCK(le->inp); return (0); } /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); tcp_lro_active_insert(lc, bucket, le); lro_set_mtime(&le->mtime, &arrv); /* Start filling in details. 
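	 *
	 * One detail worth noting for the timestamp fields set up below:
	 * le->tsval is kept in host order (it is compared with TSTMP_GT()
	 * while condensing), whereas le->tsecr stays in network order so
	 * it can later be written back into the header verbatim.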
*/ switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: le->le_ip6 = ip6; le->source_ip6 = ip6->ip6_src; le->dest_ip6 = ip6->ip6_dst; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); break; #endif #ifdef INET case ETHERTYPE_IP: le->le_ip4 = ip4; le->source_ip4 = ip4->ip_src.s_addr; le->dest_ip4 = ip4->ip_dst.s_addr; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif } le->source_port = th->th_sport; le->dest_port = th->th_dport; le->next_seq = seq + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; if (hdr_len != 0) { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", __func__, le, le->ulp_csum)); le->append_cnt = 0; le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); le->append_cnt++; th->th_sum = csum; /* Restore checksum */ m->m_pkthdr.rcvif = lc->ifp; m->m_pkthdr.lro_len = tcp_data_len; le->mbuf_cnt = 1; le->cmp_ack_cnt = 0; le->flags = 0; #ifdef TCPHPTS /* * Lets find out if we can use the mbuf-compression. */ if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG)) goto skip_lookup_b; CURVNET_SET(lc->ifp->if_vnet); le->inp = tcp_lro_lookup(lc, le); if (le->inp && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || (le->inp->inp_flags2 & INP_FREED))) { INP_WUNLOCK(le->inp); le->inp = NULL; } if (le->inp) { if ((need_flush == 1) && (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) && ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) { /* * For MBUF queuing or ACKCMP we can accept ECE and CWR * since each packet is sent to the transport (or the * compressed state including the ECN bits). */ need_flush = 0; } locked = 1; if (le->inp->inp_flags2 & INP_MBUF_ACKCMP) le->flags |= CAN_USE_ACKCMP; if ((le->flags & CAN_USE_ACKCMP) || (le->inp && (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) { m = do_bpf_and_csum(le->inp, lc, le, eh, m, bpf_peers_present(lc->ifp->if_bpf), locked); if (m == NULL) { /* Bad csum, accounting already done */ INP_WUNLOCK(le->inp); le->inp = NULL; return (0); } le->strip_cnt++; } in_pcbref(le->inp); } CURVNET_RESTORE(); if ((need_flush == 0) && (th->th_flags & TH_ACK) && (tcp_data_len == 0) && (le->flags & CAN_USE_ACKCMP)) { /* Ok this is a pure ack lets build our special COMPRESS mbuf */ struct mbuf *nm; struct tcp_ackent *ack_ent; /* Question what is going on with the last mbuf on the inp queue, can we use it? 
*/ INP_WLOCK_ASSERT(le->inp); nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp); if (nm) { int idx; /* We can add in to the one on the tail */ ack_ent = mtod(nm, struct tcp_ackent *); idx = (nm->m_len / sizeof(struct tcp_ackent)); build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); nm->m_len += sizeof(struct tcp_ackent); nm->m_pkthdr.len += sizeof(struct tcp_ackent); le->ack_seq = th->th_ack; le->window = th->th_win; m_freem(m); counter_u64_add(tcp_extra_mbuf, 1); le->m_head = NULL; le->m_tail = NULL; le->m_last_mbuf = NULL; le->m_prev_last = NULL; INP_WUNLOCK(le->inp); return (0); } else { if (le->inp->inp_flags2 & INP_MBUF_L_ACKS) nm = m_getcl(M_NOWAIT, MT_DATA, (M_ACKCMP|M_PKTHDR)); else { nm = m_gethdr(M_NOWAIT, MT_DATA); nm->m_flags |= M_ACKCMP; } if (nm) { nm->m_pkthdr.rcvif = lc->ifp; ack_ent = mtod(nm, struct tcp_ackent *); build_ack_entry(ack_ent, th, m, hdr_len, iptos); m_freem(m); m = nm; m->m_pkthdr.len = m->m_len = sizeof(struct tcp_ackent); le->flags |= HAS_COMP_ENTRIES; le->cmp_ack_cnt++; } } } if (m->m_flags & M_ACKCMP) { counter_u64_add(tcp_comp_total, 1); } else { counter_u64_add(tcp_uncomp_total, 1); } skip_lookup_b: #endif if (need_flush) le->need_wakeup = need_flush; else le->need_wakeup = 0; m->m_nextpkt = NULL; le->m_head = m; le->m_tail = m_last(m); le->m_last_mbuf = m; le->m_prev_last = NULL; /* * We keep the total size here for cross checking when we may need * to flush/wakeup in the MBUF_QUEUE case. */ le->tcp_tot_p_len = tcp_data_len; if (locked) INP_WUNLOCK(le->inp); return (0); } int tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) { return tcp_lro_rx2(lc, m, csum, 1); } void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { struct timespec arrv; /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); return; } /* check if packet is not LRO capable */ if (__predict_false(mb->m_pkthdr.csum_flags == 0 || (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { /* input packet to network layer */ (*lc->ifp->if_input) (lc->ifp, mb); return; } /* Arrival Stamp the packet */ if ((mb->m_flags & M_TSTMP) == 0) { /* If no hardware or arrival stamp on the packet add arrival */ nanouptime(&arrv); mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) + arrv.tv_nsec); mb->m_flags |= M_TSTMP_LRO; } /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | (((uint64_t)mb->m_pkthdr.flowid) << 24) | ((uint64_t)lc->lro_mbuf_count); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; /* flush if array is full */ if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) tcp_lro_flush_all(lc); } /* end */ diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index e73a3e60fd64..b86a5d85fc76 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -1,940 +1,940 @@ /*- * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Author: Randall Stewart * This work is based on the ACM Queue paper * BBR - Congestion Based Congestion Control * and also numerous discussions with Neal, Yuchung and Van. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include "opt_kern_tls.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include #ifdef KERN_TLS #include #endif #include #include #include #ifdef NETFLIX_STATS #include /* Must come after qmath.h and tree.h */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #include #include #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "rack_bbr_common.h" /* * Common TCP Functions - These are shared by borth * rack and BBR. */ #ifdef KERN_TLS uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) { struct ktls_session *tls; uint32_t len; again: tls = so->so_snd.sb_tls_info; len = tls->params.max_frame_len; /* max tls payload */ len += tls->params.tls_hlen; /* tls header len */ len += tls->params.tls_tlen; /* tls trailer len */ if ((len * 4) > rwnd) { /* * Stroke this will suck counter and what * else should we do Drew? From the * TCP perspective I am not sure * what should be done... */ if (tls->params.max_frame_len > 4096) { tls->params.max_frame_len -= 4096; if (tls->params.max_frame_len < 4096) tls->params.max_frame_len = 4096; goto again; } } return (len); } #endif /* * The function ctf_process_inbound_raw() is used by * transport developers to do the steps needed to * support MBUF Queuing i.e. the flags in * inp->inp_flags2: * * - INP_SUPPORTS_MBUFQ * - INP_MBUF_QUEUE_READY * - INP_DONT_SACK_QUEUE * * These flags help control how LRO will deliver * packets to the transport. You first set in inp_flags2 * the INP_SUPPORTS_MBUFQ to tell the LRO code that you * will gladly take a queue of packets instead of a compressed * single packet. You also set in your t_fb pointer the * tfb_do_queued_segments to point to ctf_process_inbound_raw. * * This then gets you lists of inbound ACK's/Data instead * of a condensed compressed ACK/DATA packet. Why would you * want that? 
This will get you access to all the arrival * times of at least LRO and possibly at the Hardware (if * the interface card supports that) of the actual ACK/DATA. * In some transport designs this is important since knowing * the actual time we got the packet is useful information. * * Now there are some interesting Caveats that the transport * designer needs to take into account when using this feature. * * 1) It is used with HPTS and pacing, when the pacing timer * for output calls it will first call the input. * 2) When you set INP_MBUF_QUEUE_READY this tells LRO * queue normal packets, I am busy pacing out data and * will process the queued packets before my tfb_tcp_output * call from pacing. If a non-normal packet arrives, (e.g. sack) * you will be awoken immediately. * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even * be awoken if a SACK has arrived. You would do this when * you were not only running a pacing for output timer * but a Rack timer as well i.e. you know you are in recovery * and are in the process (via the timers) of dealing with * the loss. * * Now a critical thing you must be aware of here is that the * use of the flags has a far greater scope then just your * typical LRO. Why? Well thats because in the normal compressed * LRO case at the end of a driver interupt all packets are going * to get presented to the transport no matter if there is one * or 100. With the MBUF_QUEUE model, this is not true. You will * only be awoken to process the queue of packets when: * a) The flags discussed above allow it. * * b) You exceed a ack or data limit (by default the * ack limit is infinity (64k acks) and the data * limit is 64k of new TCP data) * * c) The push bit has been set by the peer */ int ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt) { /* * We are passed a raw change of mbuf packets * that arrived in LRO. They are linked via * the m_nextpkt link in the pkt-headers. * * We process each one by: * a) saving off the next * b) stripping off the ether-header * c) formulating the arguments for * the tfb_tcp_hpts_do_segment * d) calling each mbuf to tfb_tcp_hpts_do_segment * after adjusting the time to match the arrival time. * Note that the LRO code assures no IP options are present. * * The symantics for calling tfb_tcp_hpts_do_segment are the * following: * 1) It returns 0 if all went well and you (the caller) need * to release the lock. * 2) If nxt_pkt is set, then the function will surpress calls * to tfb_tcp_output() since you are promising to call again * with another packet. * 3) If it returns 1, then you must free all the packets being * shipped in, the tcb has been destroyed (or about to be destroyed). */ struct mbuf *m_save; struct ether_header *eh; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip = NULL; /* Keep compiler happy. */ #endif struct ifnet *ifp; struct timeval tv; int32_t retval, nxt_pkt, tlen, off; uint16_t etype; uint16_t drop_hdrlen; uint8_t iptos, no_vn=0, bpf_req=0; NET_EPOCH_ASSERT(); if (m && m->m_pkthdr.rcvif) ifp = m->m_pkthdr.rcvif; else ifp = NULL; if (ifp) { bpf_req = bpf_peers_present(ifp->if_bpf); } else { /* * We probably should not work around * but kassert, since lro alwasy sets rcvif. 
*/ no_vn = 1; goto skip_vnet; } CURVNET_SET(ifp->if_vnet); skip_vnet: while (m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; /* Now lets get the ether header */ eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Let the BPF see the packet */ if (bpf_req && ifp) ETHER_BPF_MTAP(ifp, m); m_adj(m, sizeof(*eh)); /* Trim off the ethernet header */ switch (etype) { #ifdef INET6 case ETHERTYPE_IPV6: { if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { KMOD_TCPSTAT_INC(tcps_rcvshort); m_freem(m); goto skipped_pkt; } } ip6 = (struct ip6_hdr *)(eh + 1); th = (struct tcphdr *)(ip6 + 1); tlen = ntohs(ip6->ip6_plen); drop_hdrlen = sizeof(*ip6); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); if (th->th_sum) { KMOD_TCPSTAT_INC(tcps_rcvbadsum); m_freem(m); goto skipped_pkt; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ m_freem(m); goto skipped_pkt; } - iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + iptos = IPV6_TRAFFIC_CLASS(ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { KMOD_TCPSTAT_INC(tcps_rcvshort); m_freem(m); goto skipped_pkt; } } ip = (struct ip *)(eh + 1); th = (struct tcphdr *)(ip + 1); drop_hdrlen = sizeof(*ip); iptos = ip->ip_tos; tlen = ntohs(ip->ip_len) - sizeof(struct ip); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { int len; struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = drop_hdrlen + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; } if (th->th_sum) { KMOD_TCPSTAT_INC(tcps_rcvbadsum); m_freem(m); goto skipped_pkt; } break; } #endif } /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { KMOD_TCPSTAT_INC(tcps_rcvbadoff); m_freem(m); goto skipped_pkt; } tlen -= off; drop_hdrlen += off; /* * Now lets setup the timeval to be when we should * have been called (if we can). */ m->m_pkthdr.lro_nsegs = 1; if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; } else { /* Should not be should we kassert instead? */ tcp_get_usecs(&tv); } /* Now what about next packet? 
*/ if (m_save || has_pkt) nxt_pkt = 1; else nxt_pkt = 0; KMOD_TCPSTAT_INC(tcps_rcvtotal); retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, iptos, nxt_pkt, &tv); if (retval) { /* We lost the lock and tcb probably */ m = m_save; while(m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = m_save; } if (no_vn == 0) CURVNET_RESTORE(); return(retval); } skipped_pkt: m = m_save; } if (no_vn == 0) CURVNET_RESTORE(); return(retval); } int ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) { struct mbuf *m; /* First lets see if we have old packets */ if (tp->t_in_pkt) { m = tp->t_in_pkt; tp->t_in_pkt = NULL; tp->t_tail_pkt = NULL; if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { /* We lost the tcpcb (maybe a RST came in)? */ return(1); } tcp_handle_wakeup(tp, so); } return (0); } uint32_t ctf_outstanding(struct tcpcb *tp) { uint32_t bytes_out; bytes_out = tp->snd_max - tp->snd_una; if (tp->t_state < TCPS_ESTABLISHED) bytes_out++; if (tp->t_flags & TF_SENTFIN) bytes_out++; return (bytes_out); } uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) { if (rc_sacked <= ctf_outstanding(tp)) return(ctf_outstanding(tp) - rc_sacked); else { /* TSNH */ #ifdef INVARIANTS panic("tp:%p rc_sacked:%d > out:%d", tp, rc_sacked, ctf_outstanding(tp)); #endif return (0); } } void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); } /* * ctf_drop_checks returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ int ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; int32_t tlen; thflags = *thf; tlen = *tlenp; todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; KMOD_TCPSTAT_INC(tcps_rcvduppack); KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { KMOD_TCPSTAT_INC(tcps_rcvpartduppack); KMOD_TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If segment ends after window, drop trailing data (and PUSH and * FIN); if nothing left, just ACK. 
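Before the trailing-edge trim that follows, the head-of-segment arithmetic above is worth seeing in isolation. A stand-alone, user-space rendering with hypothetical sequence numbers (the real code additionally fixes up SYN/FIN/URG and records a DSACK block for the dropped range):

```c
/*
 * Stand-alone rendering of the head-of-segment trim above, with
 * hypothetical sequence numbers.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t rcv_nxt = 1000;	/* next byte the receiver expects */
	uint32_t th_seq = 900;		/* where this segment claims to start */
	int32_t tlen = 500;		/* its payload length */
	int32_t todrop;

	todrop = (int32_t)(rcv_nxt - th_seq);
	if (todrop > 0 && todrop < tlen) {
		/* Partially duplicate: trim the first 'todrop' bytes. */
		th_seq += todrop;
		tlen -= todrop;
	}
	printf("seq %u len %d\n", (unsigned)th_seq, tlen);	/* seq 1000 len 400 */
	return (0);
}
```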
*/ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment and * ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; KMOD_TCPSTAT_INC(tcps_rcvwinprobe); } else { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH | TH_FIN); } *thf = thflags; *tlenp = tlen; return (0); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ void ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence * space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all paths to this * code happen after packets containing RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the segment * we received passes the SYN-RECEIVED ACK test. If it fails send a * RST. This breaks the loop in the "LAND" DoS attack, and also * prevents an ACK storm between two listening ports that have been * sent forged SYN segments, each with the source address of the * other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } else *ret_val = 0; tp->t_flags |= TF_ACKNOW; if (m) m_freem(m); } void ctf_do_drop(struct mbuf *m, struct tcpcb *tp) { /* * Drop space held by incoming segment and return. */ if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (m) m_freem(m); } int ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in * window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should test against * last_ack_sent instead of rcv_nxt. Note 2: we handle special case * of closed window, not covered by the RFC. */ int dropped = 0; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || ((tp->last_ack_sent - 1) == th->th_seq)) { KMOD_TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); tp = tcp_close(tp); } dropped = 1; ctf_do_drop(m, tp); } else { KMOD_TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. 
*/ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } else { m_freem(m); } return (dropped); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ void ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { NET_EPOCH_ASSERT(); KMOD_TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; ctf_do_drop(m, tp); } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; ctf_do_drop(m, NULL); } } /* * bbr_ts_check returns 1 for you should not proceed, the state * machine should return. It places in ret_val what should * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ int ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) { if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates ts_recent, * the age will be reset later and ts_recent will get a * valid value. If it does not, setting ts_recent to zero * will at least satisfy the requirement that zero be placed * in the timestamp echo reply when ts_recent isn't valid. * The age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be dropped * when ts_recent is old. */ tp->ts_recent = 0; } else { KMOD_TCPSTAT_INC(tcps_rcvduppack); KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, tlen); KMOD_TCPSTAT_INC(tcps_pawsdrop); *ret_val = 0; if (tlen) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); } else { ctf_do_drop(m, NULL); } return (1); } return (0); } void ctf_calc_rwin(struct socket *so, struct tcpcb *tp) { int32_t win; /* * Calculate amount of space in receive window, and then do TCP * input processing. Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } void ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp->t_inpcb) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); } tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } uint32_t ctf_fixed_maxseg(struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. 
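A worked, user-space rendering of that fixed-option accounting, using the PAD() rounding defined just below in the real code and the standard option lengths (TCPOLEN_TSTAMP_APPA is 12 bytes, TCPOLEN_SIGNATURE is 18); the 1460-byte MSS is only an example:

```c
/* Worked example of the fixed-option accounting described above. */
#include <stdio.h>

#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)

int
main(void)
{
	int t_maxseg = 1460;	/* example MSS */
	int optlen;

	/* Established connection that negotiated timestamps. */
	optlen = 12;		/* TCPOLEN_TSTAMP_APPA, already 4-byte aligned */
	printf("timestamps only: %d\n", t_maxseg - optlen);	/* 1448 */

	/* Same connection also using TCP-MD5 signatures. */
	optlen += PAD(18);	/* TCPOLEN_SIGNATURE (18) padded to 20 */
	printf("timestamps+md5:  %d\n", t_maxseg - optlen);	/* 1428 */
	return (0);
}
```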
* */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } void ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex8 = num_sack_blks; if (num_sack_blks > 0) { log.u_bbr.flex1 = sack_blocks[0].start; log.u_bbr.flex2 = sack_blocks[0].end; } if (num_sack_blks > 1) { log.u_bbr.flex3 = sack_blocks[1].start; log.u_bbr.flex4 = sack_blocks[1].end; } if (num_sack_blks > 2) { log.u_bbr.flex5 = sack_blocks[2].start; log.u_bbr.flex6 = sack_blocks[2].end; } if (num_sack_blks > 3) { log.u_bbr.applimited = sack_blocks[3].start; log.u_bbr.pkts_out = sack_blocks[3].end; } TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_SACK_FILTER_RES, 0, 0, &log, false, &tv); } } uint32_t ctf_decay_count(uint32_t count, uint32_t decay) { /* * Given a count, decay it by a set percentage. The * percentage is in thousands i.e. 100% = 1000, * 19.3% = 193. */ uint64_t perc_count, decay_per; uint32_t decayed_count; if (decay > 1000) { /* We don't raise it */ return (count); } perc_count = count; decay_per = decay; perc_count *= decay_per; perc_count /= 1000; /* * So now perc_count holds the * count decay value. */ decayed_count = count - (uint32_t)perc_count; return(decayed_count); } int32_t ctf_progress_timeout_check(struct tcpcb *tp, bool log) { if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* * There is an assumption that the caller * will drop the connection so we will * increment the counters here. */ if (log) tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS); #ifdef NETFLIX_STATS KMOD_TCPSTAT_INC(tcps_progdrops); #endif return (1); } } return (0); } diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c index f227930743b7..1903b01e03d4 100644 --- a/sys/netinet6/frag6.c +++ b/sys/netinet6/frag6.c @@ -1,1049 +1,1048 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * Copyright (c) 2019 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* For ECN definitions. */ #include /* For ECN definitions. */ #ifdef MAC #include #endif /* * A "big picture" of how IPv6 fragment queues are all linked together. * * struct ip6qbucket ip6qb[...]; hashed buckets * |||||||| * | * +--- TAILQ(struct ip6q, packets) *q6; tailq entries holding * |||||||| fragmented packets * | (1 per original packet) * | * +--- TAILQ(struct ip6asfrag, ip6q_frags) *af6; tailq entries of IPv6 * | *ip6af;fragment packets * | for one original packet * + *mbuf */ /* Reassembly headers are stored in hash buckets. */ #define IP6REASS_NHASH_LOG2 10 #define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #define IP6REASS_HMASK (IP6REASS_NHASH - 1) TAILQ_HEAD(ip6qhead, ip6q); struct ip6qbucket { struct ip6qhead packets; struct mtx lock; int count; }; struct ip6asfrag { TAILQ_ENTRY(ip6asfrag) ip6af_tq; struct mbuf *ip6af_m; int ip6af_offset; /* Offset in ip6af_m to next header. */ int ip6af_frglen; /* Fragmentable part length. */ int ip6af_off; /* Fragment offset. */ bool ip6af_mff; /* More fragment bit in frag off. */ }; static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header"); #ifdef VIMAGE /* A flag to indicate if IPv6 fragmentation is initialized. */ VNET_DEFINE_STATIC(bool, frag6_on); #define V_frag6_on VNET(frag6_on) #endif /* System wide (global) maximum and count of packets in reassembly queues. */ static int ip6_maxfrags; static volatile u_int frag6_nfrags = 0; /* Maximum and current packets in per-VNET reassembly queue. */ VNET_DEFINE_STATIC(int, ip6_maxfragpackets); VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); #define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_frag6_nfragpackets VNET(frag6_nfragpackets) /* Maximum per-VNET reassembly queues per bucket and fragments per packet. */ VNET_DEFINE_STATIC(int, ip6_maxfragbucketsize); VNET_DEFINE_STATIC(int, ip6_maxfragsperpacket); #define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) #define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) /* Per-VNET reassembly queue buckets. 
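The bucket, packet, and fragment nesting sketched in the "big picture" comment above can be exercised in user space with the same <sys/queue.h> macros. A toy rendering with trimmed-down stand-in structs (names shortened; one bucket, one original packet, two fragments):

```c
/* Toy user-space rendering of the bucket -> packet -> fragment nesting above. */
#include <stdio.h>
#include <sys/queue.h>

struct frag {				/* stands in for struct ip6asfrag */
	TAILQ_ENTRY(frag) link;
	int off, len;
};
TAILQ_HEAD(fraghead, frag);

struct pkt {				/* stands in for struct ip6q */
	TAILQ_ENTRY(pkt) link;
	struct fraghead frags;
	unsigned ident;
};
TAILQ_HEAD(pkthead, pkt);

struct bucket {				/* stands in for struct ip6qbucket */
	struct pkthead packets;
	int count;
};

int
main(void)
{
	struct bucket b = { .count = 0 };
	struct pkt p = { .ident = 0x1234 };
	struct frag f0 = { .off = 0, .len = 1232 };
	struct frag f1 = { .off = 1232, .len = 320 };
	struct frag *f;

	TAILQ_INIT(&b.packets);
	TAILQ_INIT(&p.frags);
	TAILQ_INSERT_HEAD(&b.packets, &p, link);
	b.count++;
	TAILQ_INSERT_TAIL(&p.frags, &f0, link);
	TAILQ_INSERT_TAIL(&p.frags, &f1, link);

	/* Walk the fragments the way frag6 walks a reassembly queue. */
	TAILQ_FOREACH(f, &p.frags, link)
		printf("ident %#x: fragment at %d, %d bytes\n",
		    p.ident, f->off, f->len);
	return (0);
}
```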
*/ VNET_DEFINE_STATIC(struct ip6qbucket, ip6qb[IP6REASS_NHASH]); VNET_DEFINE_STATIC(uint32_t, ip6qb_hashseed); #define V_ip6qb VNET(ip6qb) #define V_ip6qb_hashseed VNET(ip6qb_hashseed) #define IP6QB_LOCK(_b) mtx_lock(&V_ip6qb[(_b)].lock) #define IP6QB_TRYLOCK(_b) mtx_trylock(&V_ip6qb[(_b)].lock) #define IP6QB_LOCK_ASSERT(_b) mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED) #define IP6QB_UNLOCK(_b) mtx_unlock(&V_ip6qb[(_b)].lock) #define IP6QB_HEAD(_b) (&V_ip6qb[(_b)].packets) /* * By default, limit the number of IP6 fragments across all reassembly * queues to 1/32 of the total number of mbuf clusters. * * Limit the total number of reassembly queues per VNET to the * IP6 fragment limit, but ensure the limit will not allow any bucket * to grow above 100 items. (The bucket limit is * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct * multiplier to reach a 100-item limit.) * The 100-item limit was chosen as brief testing seems to show that * this produces "reasonable" performance on some subset of systems * under DoS attack. */ #define IP6_MAXFRAGS (nmbclusters / 32) #define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) /* * Sysctls and helper function. */ SYSCTL_DECL(_net_inet6_ip6); SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfrags, CTLFLAG_RD, __DEVOLATILE(u_int *, &frag6_nfrags), 0, "Global number of IPv6 fragments across all reassembly queues."); static void frag6_set_bucketsize(void) { int i; if ((i = V_ip6_maxfragpackets) > 0) V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, "Maximum allowed number of outstanding IPv6 packet fragments. " "A value of 0 means no fragmented packets will be accepted, while a " "a value of -1 means no limit"); static int sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_maxfragpackets; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); V_ip6_maxfragpackets = val; frag6_set_bucketsize(); return (0); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_maxfragpackets, "I", "Default maximum number of outstanding fragmented IPv6 packets. " "A value of 0 means no fragmented packets will be accepted, while a " "a value of -1 means no limit"); SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfragpackets, CTLFLAG_VNET | CTLFLAG_RD, __DEVOLATILE(u_int *, &VNET_NAME(frag6_nfragpackets)), 0, "Per-VNET number of IPv6 fragments across all reassembly queues."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, "Maximum allowed number of fragments per packet"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, "Maximum number of reassembly queues per hash bucket"); /* * Remove the IPv6 fragmentation header from the mbuf. */ int ip6_deletefraghdr(struct mbuf *m, int offset, int wait __unused) { struct ip6_hdr *ip6; KASSERT(m->m_len >= offset + sizeof(struct ip6_frag), ("%s: ext headers not contigous in mbuf %p m_len %d >= " "offset %d + %zu\n", __func__, m, m->m_len, offset, sizeof(struct ip6_frag))); /* Delete frag6 header. 
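Before moving on to the fragment-header removal, the limit arithmetic from the comment and frag6_set_bucketsize() above is easy to sanity-check in isolation; nmbclusters is an assumed figure here, since the real value scales with available memory:

```c
/* Stand-alone check of the reassembly limit arithmetic described above. */
#include <stdio.h>

#define IP6REASS_NHASH_LOG2	10
#define IP6REASS_NHASH		(1 << IP6REASS_NHASH_LOG2)

static int imin(int a, int b) { return (a < b ? a : b); }
static int imax(int a, int b) { return (a > b ? a : b); }

int
main(void)
{
	int nmbclusters = 2000000;				/* assumed */
	int maxfrags = nmbclusters / 32;			/* IP6_MAXFRAGS */
	int maxfragpackets = imin(maxfrags, IP6REASS_NHASH * 50);
	int bucketsize = imax(maxfragpackets / (IP6REASS_NHASH / 2), 1);

	/* With a large cluster pool the 50x cap wins: 51200 / 512 == 100. */
	printf("maxfragpackets %d, per-bucket limit %d\n",
	    maxfragpackets, bucketsize);
	return (0);
}
```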
*/ ip6 = mtod(m, struct ip6_hdr *); bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), offset); m->m_data += sizeof(struct ip6_frag); m->m_len -= sizeof(struct ip6_frag); m->m_flags |= M_FRAGMENTED; return (0); } /* * Free a fragment reassembly header and all associated datagrams. */ static void frag6_freef(struct ip6q *q6, uint32_t bucket) { struct ip6_hdr *ip6; struct ip6asfrag *af6; struct mbuf *m; IP6QB_LOCK_ASSERT(bucket); while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { m = af6->ip6af_m; TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); /* * Return ICMP time exceeded error for the 1st fragment. * Just free other fragments. */ if (af6->ip6af_off == 0 && m->m_pkthdr.rcvif != NULL) { /* Adjust pointer. */ ip6 = mtod(m, struct ip6_hdr *); /* Restore source and destination addresses. */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_REASSEMBLY, 0); } else m_freem(m); free(af6, M_FRAG6); } TAILQ_REMOVE(IP6QB_HEAD(bucket), q6, ip6q_tq); V_ip6qb[bucket].count--; atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); } /* * Drain off all datagram fragments belonging to * the given network interface. */ static void frag6_cleanup(void *arg __unused, struct ifnet *ifp) { struct ip6qhead *head; struct ip6q *q6; struct ip6asfrag *af6; uint32_t bucket; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); CURVNET_SET_QUIET(ifp->if_vnet); #ifdef VIMAGE /* * Skip processing if IPv6 reassembly is not initialised or * torn down by frag6_destroy(). */ if (!V_frag6_on) { CURVNET_RESTORE(); return; } #endif for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); /* Scan fragment list. */ TAILQ_FOREACH(q6, head, ip6q_tq) { TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { /* Clear no longer valid rcvif pointer. */ if (af6->ip6af_m->m_pkthdr.rcvif == ifp) af6->ip6af_m->m_pkthdr.rcvif = NULL; } } IP6QB_UNLOCK(bucket); } CURVNET_RESTORE(); } EVENTHANDLER_DEFINE(ifnet_departure_event, frag6_cleanup, NULL, 0); /* * Like in RFC2460, in RFC8200, fragment and reassembly rules do not agree with * each other, in terms of next header field handling in fragment header. * While the sender will use the same value for all of the fragmented packets, * receiver is suggested not to check for consistency. * * Fragment rules (p18,p19): * (2) A Fragment header containing: * The Next Header value that identifies the first header * after the Per-Fragment headers of the original packet. * -> next header field is same for all fragments * * Reassembly rule (p20): * The Next Header field of the last header of the Per-Fragment * headers is obtained from the Next Header field of the first * fragment's Fragment header. * -> should grab it from the first fragment only * * The following note also contradicts with fragment rule - no one is going to * send different fragment with different next header field. * * Additional note (p22) [not an error]: * The Next Header values in the Fragment headers of different * fragments of the same original packet may differ. Only the value * from the Offset zero fragment packet is used for reassembly. * -> should grab it from the first fragment only * * There is no explicit reason given in the RFC. Historical reason maybe? */ /* * Fragment input. 
*/ int frag6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; struct ip6qhead *head; struct ip6q *q6; struct ip6asfrag *af6, *ip6af, *af6tmp; struct in6_ifaddr *ia6; struct ifnet *dstifp, *srcifp; uint32_t hashkey[(sizeof(struct in6_addr) * 2 + sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)]; uint32_t bucket, *hashkeyp; int fragoff, frgpartlen; /* Must be larger than uint16_t. */ int nxt, offset, plen; uint8_t ecn, ecn0; bool only_frag; #ifdef RSS struct ip6_direct_ctx *ip6dc; struct m_tag *mtag; #endif m = *mp; offset = *offp; M_ASSERTPKTHDR(m); if (m->m_len < offset + sizeof(struct ip6_frag)) { m = m_pullup(m, offset + sizeof(struct ip6_frag)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); dstifp = NULL; /* Find the destination interface of the packet. */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL) dstifp = ia6->ia_ifp; /* Jumbo payload cannot contain a fragment header. */ if (ip6->ip6_plen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); *mp = NULL; return (IPPROTO_DONE); } /* * Check whether fragment packet's fragment length is a * multiple of 8 octets (unless it is the last one). * sizeof(struct ip6_frag) == 8 * sizeof(struct ip6_hdr) = 40 */ ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) && (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); *mp = NULL; return (IPPROTO_DONE); } IP6STAT_INC(ip6s_fragments); in6_ifstat_inc(dstifp, ifs6_reass_reqd); /* * Handle "atomic" fragments (offset and m bit set to 0) upfront, * unrelated to any reassembly. We need to remove the frag hdr * which is ugly. * See RFC 6946 and section 4.5 of RFC 8200. */ if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { IP6STAT_INC(ip6s_atomicfrags); nxt = ip6f->ip6f_nxt; /* * Set nxt(-hdr field value) to the original value. * We cannot just set ip6->ip6_nxt as there might be * an unfragmentable part with extension headers and * we must update the last one. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct ip6_frag)); if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) goto dropfrag2; m->m_pkthdr.len -= sizeof(struct ip6_frag); in6_ifstat_inc(dstifp, ifs6_reass_ok); *mp = m; return (nxt); } /* Offset now points to data portion. */ offset += sizeof(struct ip6_frag); /* Get fragment length and discard 0-byte fragments. */ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (frgpartlen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); *mp = NULL; return (IPPROTO_DONE); } /* * Enforce upper bound on number of fragments for the entire system. * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. 
*/ if (ip6_maxfrags < 0) ; else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags) goto dropfrag2; /* * Validate that a full header chain to the ULP is present in the * packet containing the first fragment as per RFC RFC7112 and * RFC 8200 pages 18,19: * The first fragment packet is composed of: * (3) Extension headers, if any, and the Upper-Layer header. These * headers must be in the first fragment. ... */ fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); /* XXX TODO. thj has D16851 open for this. */ /* Send ICMPv6 4,3 in case of violation. */ /* Store receive network interface pointer for later. */ srcifp = m->m_pkthdr.rcvif; /* Generate a hash value for fragment bucket selection. */ hashkeyp = hashkey; memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); *hashkeyp = ip6f->ip6f_ident; bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed); bucket &= IP6REASS_HMASK; IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); TAILQ_FOREACH(q6, head, ip6q_tq) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst) #ifdef MAC && mac_ip6q_match(m, q6) #endif ) break; only_frag = false; if (q6 == NULL) { /* A first fragment to arrive creates a reassembly queue. */ only_frag = true; /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; * If maxfragpackets is 0, never accept fragments. * If maxfragpackets is -1, accept all fragments without * limitation. */ if (V_ip6_maxfragpackets < 0) ; else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize || atomic_load_int(&V_frag6_nfragpackets) >= (u_int)V_ip6_maxfragpackets) goto dropfrag; /* Allocate IPv6 fragement packet queue entry. */ q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FRAG6, M_NOWAIT | M_ZERO); if (q6 == NULL) goto dropfrag; #ifdef MAC if (mac_ip6q_init(q6, M_NOWAIT) != 0) { free(q6, M_FRAG6); goto dropfrag; } mac_ip6q_create(m, q6); #endif atomic_add_int(&V_frag6_nfragpackets, 1); /* ip6q_nxt will be filled afterwards, from 1st fragment. */ TAILQ_INIT(&q6->ip6q_frags); q6->ip6q_ident = ip6f->ip6f_ident; q6->ip6q_ttl = IPV6_FRAGTTL; q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; - q6->ip6q_ecn = - (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; + q6->ip6q_ecn = IPV6_ECN(ip6); q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */ /* Add the fragemented packet to the bucket. */ TAILQ_INSERT_HEAD(head, q6, ip6q_tq); V_ip6qb[bucket].count++; } /* * If it is the 1st fragment, record the length of the * unfragmentable part and the next header of the fragment header. * Assume the first 1st fragement to arrive will be correct. * We do not have any duplicate checks here yet so another packet * with fragoff == 0 could come and overwrite the ip6q_unfrglen * and worse, the next header, at any time. */ if (fragoff == 0 && q6->ip6q_unfrglen == -1) { q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag); q6->ip6q_nxt = ip6f->ip6f_nxt; /* XXX ECN? */ } /* * Check that the reassembled packet would not exceed 65535 bytes * in size. * If it would exceed, discard the fragment and return an ICMP error. */ if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. 
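The bound enforced in the block just below is plain addition over the three quantities recorded above; a throwaway example with hypothetical numbers (an offset of 56 corresponds to one 8-byte extension header between the IPv6 and fragment headers):

```c
/* Hypothetical numbers for the 65535-byte reassembly bound enforced below. */
#include <stdio.h>

#define IPV6_MAXPACKET	65535	/* maximum IPv6 payload length */

int
main(void)
{
	int unfrglen = 56 - 40 - 8;	/* ext. headers between IPv6 and fragment header */
	int fragoff = 65280;		/* byte offset claimed by this fragment */
	int frgpartlen = 512;		/* fragmentable bytes it carries */

	if (unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET)
		printf("reject: %d bytes would exceed %d\n",
		    unfrglen + fragoff + frgpartlen, IPV6_MAXPACKET);	/* 65800 */
	return (0);
}
```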
*/ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { if (only_frag) { TAILQ_REMOVE(head, q6, ip6q_tq); V_ip6qb[bucket].count--; atomic_subtract_int(&V_frag6_nfragpackets, 1); #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FRAG6); } IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); *mp = NULL; return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { if (only_frag) { TAILQ_REMOVE(head, q6, ip6q_tq); V_ip6qb[bucket].count--; atomic_subtract_int(&V_frag6_nfragpackets, 1); #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FRAG6); } IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); *mp = NULL; return (IPPROTO_DONE); } /* * If it is the first fragment, do the above check for each * fragment already stored in the reassembly queue. */ if (fragoff == 0 && !only_frag) { TAILQ_FOREACH_SAFE(af6, &q6->ip6q_frags, ip6af_tq, af6tmp) { if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen > IPV6_MAXPACKET) { struct ip6_hdr *ip6err; struct mbuf *merr; int erroff; merr = af6->ip6af_m; erroff = af6->ip6af_offset; /* Dequeue the fragment. */ TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); q6->ip6q_nfrag--; atomic_subtract_int(&frag6_nfrags, 1); free(af6, M_FRAG6); /* Set a valid receive interface pointer. */ merr->m_pkthdr.rcvif = srcifp; /* Adjust pointer. */ ip6err = mtod(merr, struct ip6_hdr *); /* * Restore source and destination addresses * in the erroneous IPv6 header. */ ip6err->ip6_src = q6->ip6q_src; ip6err->ip6_dst = q6->ip6q_dst; icmp6_error(merr, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); } } } /* Allocate an IPv6 fragement queue entry for this fragmented part. */ ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FRAG6, M_NOWAIT | M_ZERO); if (ip6af == NULL) goto dropfrag; ip6af->ip6af_mff = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) ? true : false; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; ip6af->ip6af_offset = offset; ip6af->ip6af_m = m; if (only_frag) { /* * Do a manual insert rather than a hard-to-understand cast * to a different type relying on data structure order to work. */ TAILQ_INSERT_HEAD(&q6->ip6q_frags, ip6af, ip6af_tq); goto postinsert; } /* Do duplicate, condition, and boundry checks. */ /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * Drop if CE and not-ECT are mixed for the same packet. */ - ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; + ecn = IPV6_ECN(ip6); ecn0 = q6->ip6q_ecn; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) { free(ip6af, M_FRAG6); goto dropfrag; } if (ecn0 != IPTOS_ECN_CE) q6->ip6q_ecn = IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { free(ip6af, M_FRAG6); goto dropfrag; } /* Find a fragmented part which begins after this one does. */ TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) if (af6->ip6af_off > ip6af->ip6af_off) break; /* * If the incoming framgent overlaps some existing fragments in * the reassembly queue, drop both the new fragment and the * entire reassembly queue. However, if the new fragment * is an exact duplicate of an existing fragment, only silently * drop the existing fragment and leave the fragmentation queue * unchanged, as allowed by the RFC. 
(RFC 8200, 4.5) */ if (af6 != NULL) af6tmp = TAILQ_PREV(af6, ip6fraghead, ip6af_tq); else af6tmp = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); if (af6tmp != NULL) { if (af6tmp->ip6af_off + af6tmp->ip6af_frglen - ip6af->ip6af_off > 0) { if (af6tmp->ip6af_off != ip6af->ip6af_off || af6tmp->ip6af_frglen != ip6af->ip6af_frglen) frag6_freef(q6, bucket); free(ip6af, M_FRAG6); goto dropfrag; } } if (af6 != NULL) { if (ip6af->ip6af_off + ip6af->ip6af_frglen - af6->ip6af_off > 0) { if (af6->ip6af_off != ip6af->ip6af_off || af6->ip6af_frglen != ip6af->ip6af_frglen) frag6_freef(q6, bucket); free(ip6af, M_FRAG6); goto dropfrag; } } #ifdef MAC mac_ip6q_update(m, q6); #endif /* * Stick new segment in its place; check for complete reassembly. * If not complete, check fragment limit. Move to front of packet * queue, as we are the most recently active fragmented packet. */ if (af6 != NULL) TAILQ_INSERT_BEFORE(af6, ip6af, ip6af_tq); else TAILQ_INSERT_TAIL(&q6->ip6q_frags, ip6af, ip6af_tq); postinsert: atomic_add_int(&frag6_nfrags, 1); q6->ip6q_nfrag++; plen = 0; TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { if (af6->ip6af_off != plen) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); *mp = NULL; return (IPPROTO_DONE); } plen += af6->ip6af_frglen; } af6 = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); if (af6->ip6af_mff) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); *mp = NULL; return (IPPROTO_DONE); } /* Reassembly is complete; concatenate fragments. */ ip6af = TAILQ_FIRST(&q6->ip6q_frags); t = m = ip6af->ip6af_m; TAILQ_REMOVE(&q6->ip6q_frags, ip6af, ip6af_tq); while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { m->m_pkthdr.csum_flags &= af6->ip6af_m->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += af6->ip6af_m->m_pkthdr.csum_data; TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); t = m_last(t); m_adj(af6->ip6af_m, af6->ip6af_offset); m_demote_pkthdr(af6->ip6af_m); m_cat(t, af6->ip6af_m); free(af6, M_FRAG6); } while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); /* Adjust offset to point where the original next header starts. */ offset = ip6af->ip6af_offset - sizeof(struct ip6_frag); free(ip6af, M_FRAG6); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr)); if (q6->ip6q_ecn == IPTOS_ECN_CE) ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); nxt = q6->ip6q_nxt; TAILQ_REMOVE(head, q6, ip6q_tq); V_ip6qb[bucket].count--; atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); ip6_deletefraghdr(m, offset, M_NOWAIT); /* Set nxt(-hdr field value) to the original value. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); #endif free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; /* Set a valid receive interface pointer. 
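The checksum handling in the concatenation loop above, accumulating each fragment's csum_data and then folding the carries, is ordinary ones-complement arithmetic; a stand-alone illustration with a made-up accumulated value:

```c
/* Stand-alone illustration of the csum_data folding loop above. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Made-up partial checksum accumulated across several fragments. */
	uint32_t csum_data = 0x0002abcd;

	/* Fold the carries back in until the value fits in 16 bits. */
	while (csum_data & 0xffff0000)
		csum_data = (csum_data & 0xffff) + (csum_data >> 16);

	printf("folded checksum: 0x%04x\n", (unsigned)csum_data);	/* 0xabcf */
	return (0);
}
```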
*/ m->m_pkthdr.rcvif = srcifp; } #ifdef RSS mtag = m_tag_alloc(MTAG_ABI_IPV6, IPV6_TAG_DIRECT, sizeof(*ip6dc), M_NOWAIT); if (mtag == NULL) goto dropfrag; ip6dc = (struct ip6_direct_ctx *)(mtag + 1); ip6dc->ip6dc_nxt = nxt; ip6dc->ip6dc_off = offset; m_tag_prepend(m, mtag); #endif IP6QB_UNLOCK(bucket); IP6STAT_INC(ip6s_reassembled); in6_ifstat_inc(dstifp, ifs6_reass_ok); #ifdef RSS /* Queue/dispatch for reprocessing. */ netisr_dispatch(NETISR_IPV6_DIRECT, m); *mp = NULL; return (IPPROTO_DONE); #endif /* Tell launch routine the next header. */ *mp = m; *offp = offset; return (nxt); dropfrag: IP6QB_UNLOCK(bucket); dropfrag2: in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); m_freem(m); *mp = NULL; return (IPPROTO_DONE); } /* * IPv6 reassembling timer processing; * if a timer expires on a reassembly queue, discard it. */ void frag6_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ip6qhead *head; struct ip6q *q6, *q6tmp; uint32_t bucket; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); TAILQ_FOREACH_SAFE(q6, head, ip6q_tq, q6tmp) if (--q6->ip6q_ttl == 0) { IP6STAT_ADD(ip6s_fragtimeout, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. * Note that we drain all reassembly queues if * maxfragpackets is 0 (fragmentation is disabled), * and do not enforce a limit when maxfragpackets * is negative. */ while ((V_ip6_maxfragpackets == 0 || (V_ip6_maxfragpackets > 0 && V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) && (q6 = TAILQ_LAST(head, ip6qhead)) != NULL) { IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); } /* * If we are still over the maximum number of fragmented * packets, drain off enough to get down to the new limit. */ bucket = 0; while (V_ip6_maxfragpackets >= 0 && atomic_load_int(&V_frag6_nfragpackets) > (u_int)V_ip6_maxfragpackets) { IP6QB_LOCK(bucket); q6 = TAILQ_LAST(IP6QB_HEAD(bucket), ip6qhead); if (q6 != NULL) { IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); bucket = (bucket + 1) % IP6REASS_NHASH; } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Eventhandler to adjust limits in case nmbclusters change. */ static void frag6_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); ip6_maxfrags = IP6_MAXFRAGS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Initialise reassembly queue and fragment identifier. */ void frag6_init(void) { uint32_t bucket; V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { TAILQ_INIT(IP6QB_HEAD(bucket)); mtx_init(&V_ip6qb[bucket].lock, "ip6qb", NULL, MTX_DEF); V_ip6qb[bucket].count = 0; } V_ip6qb_hashseed = arc4random(); V_ip6_maxfragsperpacket = 64; #ifdef VIMAGE V_frag6_on = true; #endif if (!IS_DEFAULT_VNET(curvnet)) return; ip6_maxfrags = IP6_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } /* * Drain off all datagram fragments. 
*/ static void frag6_drain_one(void) { struct ip6q *q6; uint32_t bucket; for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { IP6QB_LOCK(bucket); while ((q6 = TAILQ_FIRST(IP6QB_HEAD(bucket))) != NULL) { IP6STAT_INC(ip6s_fragdropped); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); } } void frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); frag6_drain_one(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } #ifdef VIMAGE /* * Clear up IPv6 reassembly structures. */ void frag6_destroy(void) { uint32_t bucket; frag6_drain_one(); V_frag6_on = false; for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { KASSERT(V_ip6qb[bucket].count == 0, ("%s: V_ip6qb[%d] (%p) count not 0 (%d)", __func__, bucket, &V_ip6qb[bucket], V_ip6qb[bucket].count)); mtx_destroy(&V_ip6qb[bucket].lock); } } #endif diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index 33cc06d065b8..54cb81c6130f 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -1,484 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #include #include #include #include #define GIF_HLIM 30 VNET_DEFINE_STATIC(int, ip6_gif_hlim) = GIF_HLIM; #define V_ip6_gif_hlim VNET(ip6_gif_hlim) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gif_hlim), 0, "Default hop limit for encapsulated packets"); /* * We keep interfaces in a hash table using src+dst as key. 
* Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list. */ VNET_DEFINE_STATIC(struct gif_list *, ipv6_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gif_list *, ipv6_srchashtbl) = NULL; VNET_DEFINE_STATIC(struct gif_list, ipv6_list) = CK_LIST_HEAD_INITIALIZER(); #define V_ipv6_hashtbl VNET(ipv6_hashtbl) #define V_ipv6_srchashtbl VNET(ipv6_srchashtbl) #define V_ipv6_list VNET(ipv6_list) #define GIF_HASH(src, dst) (V_ipv6_hashtbl[\ in6_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)]) #define GIF_SRCHASH(src) (V_ipv6_srchashtbl[\ fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GIF_HASH_SIZE - 1)]) #define GIF_HASH_SC(sc) GIF_HASH(&(sc)->gif_ip6hdr->ip6_src,\ &(sc)->gif_ip6hdr->ip6_dst) static uint32_t in6_gif_hashval(const struct in6_addr *src, const struct in6_addr *dst) { uint32_t ret; ret = fnv_32_buf(src, sizeof(*src), FNV1_32_INIT); return (fnv_32_buf(dst, sizeof(*dst), ret)); } static int in6_gif_checkdup(const struct gif_softc *sc, const struct in6_addr *src, const struct in6_addr *dst) { struct gif_softc *tmp; if (sc->gif_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, src) && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, dst)) return (EEXIST); CK_LIST_FOREACH(tmp, &GIF_HASH(src, dst), chain) { if (tmp == sc) continue; if (IN6_ARE_ADDR_EQUAL(&tmp->gif_ip6hdr->ip6_src, src) && IN6_ARE_ADDR_EQUAL(&tmp->gif_ip6hdr->ip6_dst, dst)) return (EADDRNOTAVAIL); } return (0); } /* * Check that ingress address belongs to local host. */ static void in6_gif_set_running(struct gif_softc *sc) { if (in6_localip(&sc->gif_ip6hdr->ip6_src)) GIF2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. 
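Tunnel lookup above keys on the outer src+dst pair through a small hash table; GIF_HASH() amounts to chaining 32-bit FNV over the two addresses and masking by the table size. A user-space sketch of that computation, with the caveats noted in the comments:

```c
/*
 * User-space sketch of the hashing behind GIF_HASH()/GIF_SRCHASH() above.
 * The offset basis and GIF_HASH_SIZE here are illustrative stand-ins; the
 * kernel takes FNV1_32_INIT/FNV_32_PRIME from <sys/fnv_hash.h>.
 */
#include <stdio.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define FNV_32_BASIS	0x811c9dc5u	/* stand-in offset basis */
#define FNV_32_PRIME	0x01000193u
#define GIF_HASH_SIZE	32		/* assumed power-of-two table size */

/* Multiply-then-XOR over each byte, mirroring the kernel's fnv_32_buf(). */
static uint32_t
fnv_32(const void *buf, size_t len, uint32_t hval)
{
	const unsigned char *p = buf;

	while (len-- != 0) {
		hval *= FNV_32_PRIME;
		hval ^= *p++;
	}
	return (hval);
}

int
main(void)
{
	struct in6_addr src, dst;
	uint32_t h;

	inet_pton(AF_INET6, "2001:db8::1", &src);
	inet_pton(AF_INET6, "2001:db8::2", &dst);

	/* Chain src, then dst, the way in6_gif_hashval() does. */
	h = fnv_32(&src, sizeof(src), FNV_32_BASIS);
	h = fnv_32(&dst, sizeof(dst), h);

	printf("tunnel hashes to bucket %u of %u\n",
	    h & (GIF_HASH_SIZE - 1), GIF_HASH_SIZE);
	return (0);
}
```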
*/ static void in6_gif_srcaddr(void *arg __unused, const struct sockaddr *sa, int event) { const struct sockaddr_in6 *sin; struct gif_softc *sc; /* Check that VNET is ready */ if (V_ipv6_hashtbl == NULL) return; NET_EPOCH_ASSERT(); sin = (const struct sockaddr_in6 *)sa; CK_LIST_FOREACH(sc, &GIF_SRCHASH(&sin->sin6_addr), srchash) { if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &sin->sin6_addr) == 0) continue; in6_gif_set_running(sc); } } static void in6_gif_attach(struct gif_softc *sc) { if (sc->gif_options & GIF_IGNORE_SOURCE) CK_LIST_INSERT_HEAD(&V_ipv6_list, sc, chain); else CK_LIST_INSERT_HEAD(&GIF_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GIF_SRCHASH(&sc->gif_ip6hdr->ip6_src), sc, srchash); } int in6_gif_setopts(struct gif_softc *sc, u_int options) { /* NOTE: we are protected with gif_ioctl_sx lock */ MPASS(sc->gif_family == AF_INET6); MPASS(sc->gif_options != options); if ((options & GIF_IGNORE_SOURCE) != (sc->gif_options & GIF_IGNORE_SOURCE)) { CK_LIST_REMOVE(sc, srchash); CK_LIST_REMOVE(sc, chain); sc->gif_options = options; in6_gif_attach(sc); } return (0); } int in6_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct sockaddr_in6 *dst, *src; struct ip6_hdr *ip6; int error; /* NOTE: we are protected with gif_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR_IN6: src = &((struct in6_aliasreq *)data)->ifra_addr; dst = &((struct in6_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin6_family != dst->sin6_family || src->sin6_family != AF_INET6 || src->sin6_len != dst->sin6_len || src->sin6_len != sizeof(*src)) break; if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) { error = EADDRNOTAVAIL; break; } /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ if ((error = sa6_embedscope(src, 0)) != 0 || (error = sa6_embedscope(dst, 0)) != 0) break; if (V_ipv6_hashtbl == NULL) { V_ipv6_hashtbl = gif_hashinit(); V_ipv6_srchashtbl = gif_hashinit(); } error = in6_gif_checkdup(sc, &src->sin6_addr, &dst->sin6_addr); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. */ error = 0; break; } ip6 = malloc(sizeof(*ip6), M_GIF, M_WAITOK | M_ZERO); ip6->ip6_src = src->sin6_addr; ip6->ip6_dst = dst->sin6_addr; ip6->ip6_vfc = IPV6_VERSION; if (sc->gif_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, srchash); CK_LIST_REMOVE(sc, chain); GIF_WAIT(); free(sc->gif_hdr, M_GIF); /* XXX: should we notify about link state change? */ } sc->gif_family = AF_INET6; sc->gif_ip6hdr = ip6; in6_gif_attach(sc); in6_gif_set_running(sc); break; case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gif_family != AF_INET6) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in6 *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin6_family = AF_INET6; src->sin6_len = sizeof(*src); src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ? 
sc->gif_ip6hdr->ip6_src: sc->gif_ip6hdr->ip6_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error == 0) error = sa6_recoverscope(src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } int in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; int len; /* prepend new IP header */ NET_EPOCH_ASSERT(); len = sizeof(struct ip6_hdr); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in6_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip6 = mtod(m, struct ip6_hdr *); MPASS(sc->gif_family == AF_INET6); bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr)); ip6->ip6_flow |= htonl((uint32_t)ecn << 20); ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. */ return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL)); } static int in6_gif_input(struct mbuf *m, int off, int proto, void *arg) { struct gif_softc *sc = arg; struct ifnet *gifp; struct ip6_hdr *ip6; uint8_t ecn; NET_EPOCH_ASSERT(); if (sc == NULL) { m_freem(m); IP6STAT_INC(ip6s_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip6 = mtod(m, struct ip6_hdr *); - ecn = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + ecn = IPV6_TRAFFIC_CLASS(ip6); m_adj(m, off); gif_input(m, gifp, proto, ecn); } else { m_freem(m); IP6STAT_INC(ip6s_nogif); } return (IPPROTO_DONE); } static int in6_gif_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip6_hdr *ip6; struct gif_softc *sc; int ret; if (V_ipv6_hashtbl == NULL) return (0); NET_EPOCH_ASSERT(); /* * NOTE: it is safe to iterate without any locking here, because softc * can be reclaimed only when we are not within net_epoch_preempt * section, but ip_encap lookup+input are executed in epoch section. */ ip6 = mtod(m, const struct ip6_hdr *); ret = 0; CK_LIST_FOREACH(sc, &GIF_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) { /* * This is an inbound packet, its ip6_dst is source address * in softc. */ if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst) && IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, &ip6->ip6_src)) { ret = ENCAP_DRV_LOOKUP; goto done; } } /* * No exact match. * Check the list of interfaces with GIF_IGNORE_SOURCE flag. 
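The in6_gif_output()/in6_gif_input() pair above round-trips the ECN bits through the flow word: output ORs the 8-bit traffic class into bits 20-27 of ip6_flow, and input now reads it back with IPV6_TRAFFIC_CLASS() in place of the open-coded shift this diff removes. A stand-alone sketch of that bit layout, with the caveats spelled out in the leading comment:

```c
/*
 * Stand-alone illustration of the IPv6 flow-word bit layout round-tripped
 * by in6_gif_output()/in6_gif_input() above.  The macro bodies here are
 * assumptions written to match the open-coded expressions this diff
 * replaces (and they take the 32-bit flow word rather than the header
 * pointer, to stay self-contained); the kernel's definitions live in its
 * ip6 headers.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define IPTOS_ECN_MASK		0x03
#define IPV6_TRAFFIC_CLASS(fl)	((ntohl(fl) >> 20) & 0xff)
#define IPV6_ECN(fl)		(IPV6_TRAFFIC_CLASS(fl) & IPTOS_ECN_MASK)

int
main(void)
{
	uint32_t ip6_flow;
	uint8_t ecn = 0x02;		/* ECT(0) */

	/* Encapsulation side: version nibble plus traffic class in bits 20-27. */
	ip6_flow = htonl(0x60000000) | htonl((uint32_t)ecn << 20);

	/* Decapsulation side recovers the same bits. */
	printf("traffic class 0x%02x, ecn 0x%x\n",
	    (unsigned)IPV6_TRAFFIC_CLASS(ip6_flow), (unsigned)IPV6_ECN(ip6_flow));
	return (0);
}
```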
*/ CK_LIST_FOREACH(sc, &V_ipv6_list, chain) { if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst)) { ret = 128 + 8; /* src + proto */ goto done; } } return (0); done: if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { if (fib6_check_urpf(sc->gif_fibnum, &ip6->ip6_src, ntohs(in6_getscope(&ip6->ip6_src)), NHR_NONE, m->m_pkthdr.rcvif) == 0) return (0); } *arg = sc; return (ret); } static const struct srcaddrtab *ipv6_srcaddrtab; static struct { const struct encap_config encap; const struct encaptab *cookie; } ipv6_encap_cfg[] = { #ifdef INET { .encap = { .proto = IPPROTO_IPV4, .min_length = sizeof(struct ip6_hdr) + sizeof(struct ip), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, }, #endif { .encap = { .proto = IPPROTO_IPV6, .min_length = 2 * sizeof(struct ip6_hdr), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, }, { .encap = { .proto = IPPROTO_ETHERIP, .min_length = sizeof(struct ip6_hdr) + sizeof(struct etherip_header) + sizeof(struct ether_header), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gif_lookup, .input = in6_gif_input }, } }; void in6_gif_init(void) { int i; if (!IS_DEFAULT_VNET(curvnet)) return; ipv6_srcaddrtab = ip6_encap_register_srcaddr(in6_gif_srcaddr, NULL, M_WAITOK); for (i = 0; i < nitems(ipv6_encap_cfg); i++) ipv6_encap_cfg[i].cookie = ip6_encap_attach( &ipv6_encap_cfg[i].encap, NULL, M_WAITOK); } void in6_gif_uninit(void) { int i; if (IS_DEFAULT_VNET(curvnet)) { for (i = 0; i < nitems(ipv6_encap_cfg); i++) ip6_encap_detach(ipv6_encap_cfg[i].cookie); ip6_encap_unregister_srcaddr(ipv6_srcaddrtab); } if (V_ipv6_hashtbl != NULL) { gif_hashdestroy(V_ipv6_hashtbl); V_ipv6_hashtbl = NULL; GIF_WAIT(); gif_hashdestroy(V_ipv6_srchashtbl); } } diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 58334788b05b..2b49a9f7c351 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,3376 +1,3376 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SCTP) || defined(SCTP_SUPPORT) #include #include #endif #include #include extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, * mp is the destination, and _ol is the optlen. */ #define MAKE_EXTHDR(hp, mp, _ol) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ (_ol) += (*(mp))->m_len; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. 
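 *
 * A worked sketch of the next-header bookkeeping MAKE_CHAIN performs,
 * using plain variables instead of mbufs (illustrative only; IPPROTO_TCP
 * stands in for whatever upper-layer protocol the packet carries, and
 * <netinet/in.h> supplies the IPPROTO_* constants):
 *
 *	uint8_t ip6_nxt = IPPROTO_TCP;	// skeletal header before chaining
 *	uint8_t hbh_nxt, rthdr_nxt;	// first byte of each extension header
 *	uint8_t *p = &ip6_nxt;
 *
 *	// chain the hop-by-hop header
 *	hbh_nxt = *p;			// hbh inherits "TCP"
 *	*p = IPPROTO_HOPOPTS;		// ip6_nxt now announces hop-by-hop
 *	p = &hbh_nxt;
 *
 *	// chain the routing header
 *	rthdr_nxt = *p;			// rthdr inherits "TCP"
 *	*p = IPPROTO_ROUTING;		// hbh now announces the routing header
 *	p = &rthdr_nxt;
 *
 * leaving ip6_nxt = HOPOPTS, hbh_nxt = ROUTING, rthdr_nxt = TCP, i.e. the
 * chain IPv6 | hbh | rthdr | payload that the code below builds with mbufs.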
*/ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("%s:%d: assumption failed: "\ "hdr not split: hdrsplit %d exthdrs %p",\ __func__, __LINE__, hdrsplit, &exthdrs);\ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } static int ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags, int plen, int optlen, bool frag) { KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p " "csum_flags %#x frag %d\n", __func__, __LINE__, plen, optlen, m, ifp, csum_flags, frag)); if ((csum_flags & CSUM_DELAY_DATA_IPV6) || #if defined(SCTP) || defined(SCTP_SUPPORT) (csum_flags & CSUM_SCTP_IPV6) || #endif (!frag && (ifp->if_capenable & IFCAP_MEXTPG) == 0)) { m = mb_unmapped_to_ext(m); if (m == NULL) { if (frag) in6_ifstat_inc(ifp, ifs6_out_fragfail); else IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } if (csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen - optlen, sizeof(struct ip6_hdr) + optlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif } return (0); } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int fraglen , uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8")); m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += fraglen) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. 
This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + fraglen >= tlen) fraglen = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(fraglen + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f); ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } static int ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_epg_tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { error = EAGAIN; goto done; } /* * Always stamp tags that include NIC ktls. */ stamp_tag = true; } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* * IP6 output. * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len, * nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_nh initialized, route lookup would be * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL, * then result of route lookup is stored in ro->ro_nh. * * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu * is uint32_t. So we use u_long to hold largest one, which is rt_mtu. 
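 *
 * A minimal calling sketch, modelled on the in6_gif_output() call earlier
 * in this patch (the epoch bracketing is assumed to be the caller's job,
 * as the NET_EPOCH_ASSERT() below requires):
 *
 *	struct epoch_tracker et;
 *	int error;
 *
 *	NET_EPOCH_ENTER(et);
 *	// no packet options, no cached route, no multicast options, no inp
 *	error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 *	NET_EPOCH_EXIT(et);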
* * ifpp - XXX: just for statistics */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev; struct route_in6 *ro_pmtu; struct nhop_object *nh; struct sockaddr_in6 *dst, sin6, src_sa, dst_sa; struct in6_addr odst; u_char *nexthdrp; int tlen, len; int error = 0; int vlan_pcp = -1; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen, plen = 0, unfragpartlen; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; bool hdrsplit; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; NET_EPOCH_ASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* Unconditionally set flowid. */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } if ((inp->inp_flags2 & INP_2PCP_SET) != 0) vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ if (IPSEC_ENABLED(ipv6)) { if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } #endif /* IPSEC */ /* Source address validation. */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } /* * If we are given packet options to add extension headers prepare them. * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ bzero(&exthdrs, sizeof(exthdrs)); optlen = 0; unfragpartlen = sizeof(struct ip6_hdr); if (opt) { /* Hop-by-Hop options header. */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen); /* Destination options header (1st part). */ if (opt->ip6po_rthdr) { #ifndef RTHDR_SUPPORT_IMPLEMENTED /* * If there is a routing header, discard the packet * right away here. RH0/1 are obsolete and we do not * currently support RH2/3/4. * People trying to use RH253/254 may want to disable * this check. * The moment we do support any routing header (again) * this block should check the routing type more * selectively. */ error = EINVAL; goto bad; #endif /* * Destination options header (1st part). * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1, optlen); } /* Routing header. */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen); unfragpartlen += optlen; /* * NOTE: we don't add AH/ESP length here (done in * ip6_ipsec_output()). */ /* Destination options header (2nd part). */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen); } /* * If there is at least one extension header, * separate IP6 header from the payload. 
*/ hdrsplit = false; if (optlen) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; ip6 = mtod(m, struct ip6_hdr *); hdrsplit = true; } /* Adjust mbuf packet header length. */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; ip6 = mtod(m, struct ip6_hdr *); hdrsplit = true; } if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); nexthdrp = &ip6->ip6_nxt; if (optlen) { /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. * Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]. * * During the header composing process "m" points to IPv6 * header. "mprev" points to an extension header prior to esp. */ mprev = m; /* * We treat dest2 specially. This makes IPsec processing * much easier. The goal here is to make mprev point the * mbuf prior to dest2. * * Result: IPv6 dest2 payload. * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("%s:%d: assumption failed: " "hdr not split: hdrsplit %d exthdrs %p", __func__, __LINE__, hdrsplit, &exthdrs); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * Result: IPv6 hbh dest1 rthdr dest2 payload. * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); } IP6STAT_INC(ip6s_localout); /* Route packet. */ ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; if (ro != NULL) dst = (struct sockaddr_in6 *)&ro->ro_dst; else dst = &sin6; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * If specified, try to fill in the traffic class field. * Do not override if a non-zero value is already set. * We check the diffserv field and the ECN field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; - if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) + if (IPV6_DSCP(ip6) == 0) mask |= 0xfc; - if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) + if (IPV6_ECN(ip6) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* Fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } if (ro == NULL || ro->ro_nh == NULL) { bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = ip6->ip6_dst; } /* * Validate route against routing table changes. * Make sure that the address family is set in route. */ nh = NULL; ifp = NULL; mtu = 0; if (ro != NULL) { if (ro->ro_nh != NULL && inp != NULL) { ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? 
*/ NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_nh != NULL && fwd_tag == NULL && (!NH_IS_VALID(ro->ro_nh) || ro->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst))) RO_INVALIDATE_CACHE(ro); if (ro->ro_nh != NULL && fwd_tag == NULL && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { nh = ro->ro_nh; ifp = nh->nh_ifp; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp, &nh, fibnum, m->m_pkthdr.flowid); if (error != 0) { IP6STAT_INC(ip6s_noroute); if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } if (ifp != NULL) mtu = ifp->if_mtu; } if (nh == NULL) { /* * If in6_selectroute() does not return a nexthop * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } else { if (nh->nh_flags & NHF_HOST) mtu = nh->nh_mtu; ia = (struct in6_ifaddr *)(nh->nh_ifa); counter_u64_add(nh->nh_pksent, 1); } } else { struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) && im6o != NULL && (ifp = im6o->im6o_multicast_ifp) != NULL) { /* We do not need a route lookup. */ *dst = dst_sa; /* XXX */ goto nonh6lookup; } in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid); if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) { if (scopeid > 0) { ifp = in6_getlinkifnet(scopeid); if (ifp == NULL) { error = EHOSTUNREACH; goto bad; } *dst = dst_sa; /* XXX */ goto nonh6lookup; } } nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, m->m_pkthdr.flowid); if (nh == NULL) { IP6STAT_INC(ip6s_noroute); /* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */ error = EHOSTUNREACH;; goto bad; } ifp = nh->nh_ifp; mtu = nh->nh_mtu; ia = ifatoia6(nh->nh_ifa); if (nh->nh_flags & NHF_GATEWAY) dst->sin6_addr = nh->gw6_sa.sin6_addr; nonh6lookup: ; } /* Then nh (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } /* Setup data structures for scope ID checks. */ src0 = ip6->ip6_src; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; dst0 = ip6->ip6_dst; /* Re-initialize to be sure. */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; /* Check for valid scope ID. */ if (in6_setscope(&src0, ifp, &zone) == 0 && sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id && in6_setscope(&dst0, ifp, &zone) == 0 && sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) { /* * The outgoing interface is in the zone of the source * and destination addresses. * * Because the loopback interface cannot receive * packets with a different scope ID than its own, * there is a trick to pretend the outgoing packet * was received by the real network interface, by * setting "origifp" different from "ifp". This is * only allowed when "ifp" is a loopback network * interface. Refer to code in nd6_output_ifp() for * more details. 
*/ origifp = ifp; /* * We should use ia_ifp to support the case of sending * packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; } else if ((ifp->if_flags & IFF_LOOPBACK) == 0 || sa6_recoverscope(&src_sa) != 0 || sa6_recoverscope(&dst_sa) != 0 || dst_sa.sin6_scope_id == 0 || (src_sa.sin6_scope_id != 0 && src_sa.sin6_scope_id != dst_sa.sin6_scope_id) || (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) { /* * If the destination network interface is not a * loopback interface, or the destination network * address has no scope ID, or the source address has * a scope ID set which is different from the * destination address one, or there is no network * interface representing this scope ID, the address * pair is considered invalid. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(ifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; } /* All scope ID checks are successful. */ if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_nh) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((nh->nh_flags & NHF_GATEWAY)) dst = &nh->gw6_sa; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. 
*/ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p " "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro, ifp, alwaysfrag, fibnum)); /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * Clear embedded scope identifiers if necessary. * in6_clearscope() will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point. */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) { case PFIL_PASS: ip6 = mtod(m, struct ip6_hdr *); break; case PFIL_DROPPED: error = EACCES; /* FALLTHROUGH */ case PFIL_CONSUMED: goto done; } needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { if (ro != NULL) RO_INVALIDATE_CACHE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. 
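 *
 * (fibnum above was seeded from the socket's FIB, which userland can pick
 * with SO_SETFIB -- also handled in ip6_ctloutput() below -- so a re-lookup
 * is only needed when a filter retags the mbuf to a different FIB.  A
 * hedged userland sketch of the socket side, with an arbitrary FIB number:)
 *
 *	int fib = 1;
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_SETFIB, &fib, sizeof(fib)) == -1)
 *		err(1, "SO_SETFIB");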
*/ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); if (ro != NULL) RO_INVALIDATE_CACHE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { if (ro != NULL) dst = (struct sockaddr_in6 *)&ro->ro_dst; else dst = &sin6; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * The logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request. */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ error = ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen, false); if (error != 0) goto bad; /* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */ tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* Case 4. */ /* Conflicting request - can't transmit. */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* Case 2-b. */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* Transmit packet without fragmentation. */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* Cases 1-a and 2-a. */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = ip6_output_send(inp, ifp, origifp, m, dst, ro, (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } /* Try to fragment the packet. Cases 1-b and 3. */ if (mtu < IPV6_MMTU) { /* Path MTU cannot be less than IPV6_MMTU. 
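 *
 * For the fragmentation path below, a worked example of the per-fragment
 * payload computation (illustrative numbers: a plain 1500-byte Ethernet
 * path MTU and no extension headers, so unfragpartlen is just the fixed
 * IPv6 header):
 *
 *	len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7
 *	    = (1500 - 40 - 8) & ~7 = 1448
 *
 * so each fragment carries up to 1448 bytes of the fragmentable part;
 * ip6_fragment() above stores each offset in ip6f_offlg in 8-byte units
 * and sets IP6F_MORE_FRAG on every fragment except the last.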
*/ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* Jumbo payload cannot be fragmented. */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ error = ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags, plen, optlen, true); if (error != 0) goto bad; /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { ip6 = mtod(m, struct ip6_hdr *); nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id); if (error != 0) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* Remove leading garbage. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); error = ip6_output_send(inp, ifp, origifp, m, dst, ro, true); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem() checks if mbuf is NULL. */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. 
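 *
 * A small worked example of the length bookkeeping (numbers illustrative):
 * for a 70,000-byte payload the 16-bit ip6_plen stays 0 and the Jumbo
 * Payload option carries
 *
 *	plen + JUMBOOPTLEN = 70000 + 8 = 70008
 *
 * because the eight bytes of hop-by-hop option space added here also count
 * toward the payload length the receiver sees (RFC 2675).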
*/ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct epoch_tracker et; struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; int error; in6_splitscope(dst, &kdst, &scopeid); NET_EPOCH_ENTER(et); nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0); if (nh != NULL) error = ip6_calcmtu(nh->nh_ifp, dst, nh->nh_mtu, mtup, NULL, 0); else error = EHOSTUNREACH; NET_EPOCH_EXIT(et); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. 
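 *
 * The selection order implemented below and in ip6_calcmtu(), condensed
 * into a hedged sketch of the usual (rt_mtu > 0) case rather than a
 * verbatim copy of the code:
 *
 *	mtu = cached ro_pmtu->ro_mtu still valid for @dst
 *	    ? ro_pmtu->ro_mtu : nexthop MTU from fib6_lookup();
 *	// then in ip6_calcmtu():
 *	hcmtu = (proto != IPPROTO_TCP) ? tcp_hc_getmtu(&inc) : 0;
 *	mtu = hcmtu ? min(hcmtu, mtu) : mtu;
 *	if (mtu == 0)
 *		mtu = IN6_LINKMTU(ifp);	// fall back to the link MTU
 *	else if (mtu < IPV6_MMTU) {
 *		alwaysfrag = 1;		// RFC 2460 sect. 5: keep sending
 *		mtu = IPV6_MMTU;	// IPV6_MMTU with a fragment header
 *	}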
*/ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst, sin6; u_long mtu; NET_EPOCH_ASSERT(); mtu = 0; if (ro_pmtu == NULL || do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ if (ro_pmtu != NULL) { sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; } else sa6_dst = &sin6; if (ro_pmtu == NULL || ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0); if (nh != NULL) { mtu = nh->nh_mtu; if (ro_pmtu != NULL) ro_pmtu->ro_mtu = mtu; } } else mtu = ro_pmtu->ro_mtu; } if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL) mtu = ro_pmtu->ro_nh->nh_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); /* TCP is known to react to pmtu changes so skip hc */ if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *inp = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif /* * Don't use more than a quarter of mbuf clusters. N.B.: * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow * on LP64 architectures, so cast to u_long to avoid undefined * behavior. ILP32 architectures cannot have nmbclusters * large enough to overflow for other reasons. 
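 *
 * A worked example of the overflow the u_long cast below avoids
 * (illustrative sizes): with nmbclusters = 1048576 and MCLBYTES = 2048,
 *
 *	1048576 * 2048 = 2147483648 = 2^31
 *
 * which no longer fits in a 32-bit int, but is fine as a u_long on LP64;
 * dividing by four then caps IPV6_PKTOPTIONS_MBUF_LIMIT at 512 MB.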
*/ #define IPV6_PKTOPTIONS_MBUF_LIMIT ((u_long)nmbclusters * MCLBYTES / 4) level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) { printf("ip6_ctloutput: mbuf limit hit\n"); error = ENOBUFS; break; } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; INP_WLOCK(inp); error = ip6_pcbopts(&inp->in6p_outputopts, m, so, sopt); INP_WUNLOCK(inp); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. 
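 *
 * A hedged userland sketch of what this check gates: an unprivileged
 * process asking to receive hop-by-hop options is refused (the
 * priv_check() below typically surfaces as EPERM):
 *
 *	int on = 1;
 *
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_RECVHOPOPTS,
 *	    &on, sizeof(on)) == -1)
 *		warn("IPV6_RECVHOPOPTS");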
*/ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_ORIGDSTADDR: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif case IPV6_VLAN_PCP: if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ inp->in6p_hops = optval; if ((inp->inp_vflag & INP_IPV4) != 0) inp->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(inp); \ inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ OPTSET2_N(bit, val); \ INP_WUNLOCK(inp); \ } while (0) #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ INP_WLOCK(inp); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: OPTSET2292_EXCLUSIVE(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_RECVHOPLIMIT: OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: OPTSET2292_EXCLUSIVE(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: INP_WLOCK(inp); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { /* * The socket is already bound. 
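 *
 * (Hence the usual userland ordering, sketched below with a sockaddr_in6
 * assumed to be filled in elsewhere: IPV6_V6ONLY has to be flipped before
 * bind(); once the socket is bound it is rejected with EINVAL.)
 *
 *	int s = socket(AF_INET6, SOCK_STREAM, 0);
 *	int on = 1;
 *
 *	setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
 *	bind(s, (struct sockaddr *)&sin6, sizeof(sin6));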
*/ INP_WUNLOCK(inp); error = EINVAL; break; } if (optval) { inp->inp_flags |= IN6P_IPV6_V6ONLY; inp->inp_vflag &= ~INP_IPV4; } else { inp->inp_flags &= ~IN6P_IPV6_V6ONLY; inp->inp_vflag |= INP_IPV4; } INP_WUNLOCK(inp); break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ OPTSET2292_EXCLUSIVE(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { INP_WLOCK(inp); inp->inp_rss_listen_bucket = optval; OPTSET2_N(INP_RSS_BUCKET_SET, 1); INP_WUNLOCK(inp); } else { error = EINVAL; } break; #endif case IPV6_VLAN_PCP: if ((optval >= -1) && (optval <= (INP_2PCP_MASK >> INP_2PCP_SHIFT))) { if (optval == -1) { INP_WLOCK(inp); inp->inp_flags2 &= ~(INP_2PCP_SET | INP_2PCP_MASK); INP_WUNLOCK(inp); } else { INP_WLOCK(inp); inp->inp_flags2 |= INP_2PCP_SET; inp->inp_flags2 &= ~INP_2PCP_MASK; inp->inp_flags2 |= optval << INP_2PCP_SHIFT; INP_WUNLOCK(inp); } } else error = EINVAL; break; } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? 
td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(inp, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IPV6_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: case IPV6_VLAN_PCP: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = inp->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = inp->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = inp->inp_flowid; break; case IPV6_FLOWTYPE: optval = inp->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; case IPV6_VLAN_PCP: if 
(OPTBIT2(INP_2PCP_SET)) { optval = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; } else { optval = -1; } break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct in6_addr addr; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. * Copy faddr out of inp to avoid holding lock * on inp during route lookup. */ INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &addr, sizeof(addr)); INP_RUNLOCK(inp); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(inp, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *inp = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if (optval < -1 || (optval % 2) != 0) { /* * The API assumes non-negative even offset * values or -1 as a special value. 
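 *
 * A hedged userland sketch for a raw, non-ICMPv6 socket whose protocol
 * keeps a 16-bit checksum at an even offset (protocol 253 and offset 4
 * are purely illustrative; 253 is one of the IANA experimental numbers):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, 253);
 *	int off = 4;		// even and non-negative, so accepted below
 *
 *	setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off));
 *	// off = -1 switches kernel checksumming back off; odd values or
 *	// anything below -1 is rejected with EINVAL.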
*/ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else inp->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = inp->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else { opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT); if (opt == NULL) return (ENOMEM); } *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_NOWAIT); if (*pktopt == NULL) return (ENOBUFS); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ INP_RUNLOCK(inp); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ INP_RLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_RUNLOCK(inp); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ pktopt = inp->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(&pktopt->field, optdata, optdatalen); \ } else { \ free(optdata, M_TEMP); \ optdata = NULL; \ malloc_optdata = false; \ } \ } \ } while(0) #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field, \ (((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3) #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field, \ pktopt->field->sa_len) static int ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; int optdatalen = 0; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; INP_RLOCK(inp); pktopt = inp->in6p_outputopts; switch (optname) { case 
IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) deftclass = pktopt->ip6po_tclass; optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: GET_PKTOPT_EXT_HDR(ip6po_hbh); break; case IPV6_RTHDR: GET_PKTOPT_EXT_HDR(ip6po_rthdr); break; case IPV6_RTHDRDSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest1); break; case IPV6_DSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest2); break; case IPV6_NEXTHOP: GET_PKTOPT_SOCKADDR(ip6po_nexthop); break; case IPV6_USE_MIN_MTU: if (pktopt) defminmtu = pktopt->ip6po_minmtu; optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) defpreftemp = pktopt->ip6po_prefer_tempaddr; optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif INP_RUNLOCK(inp); return (ENOPROTOOPT); } INP_RUNLOCK(inp); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) free(optdata, M_TEMP); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_nh) { NH_FREE(pktopt->ip6po_nextroute.ro_nh); pktopt->ip6po_nextroute.ro_nh = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_nh) { NH_FREE(pktopt->ip6po_route.ro_nh); pktopt->ip6po_route.ro_nh = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if 
(src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. 
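ip6_setpktopts() above requires all ancillary objects to sit in a single mbuf and walks them cmsghdr by cmsghdr, handing each IPPROTO_IPV6 object to ip6_setpktopt(). From userland that chain is built with the CMSG_* macros; a sketch under the assumption of the RFC 3542 send-side objects (function name and parameters are illustrative, not part of this change):

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

/* Send one datagram with a per-packet source interface and hop limit
 * carried as RFC 3542 ancillary data; both objects end up in the cmsg
 * walk of ip6_setpktopts(). */
static ssize_t
send_with_pktinfo(int s, struct sockaddr_in6 *dst, void *buf, size_t len,
    unsigned int ifindex, int hoplimit)
{
	char cbuf[CMSG_SPACE(sizeof(struct in6_pktinfo)) +
	    CMSG_SPACE(sizeof(int))];
	struct in6_pktinfo pi;
	struct iovec iov;
	struct msghdr msg;
	struct cmsghdr *cm;

	memset(cbuf, 0, sizeof(cbuf));
	memset(&pi, 0, sizeof(pi));
	pi.ipi6_ifindex = ifindex;	/* 0 lets the kernel choose */

	iov.iov_base = buf;
	iov.iov_len = len;
	memset(&msg, 0, sizeof(msg));
	msg.msg_name = dst;
	msg.msg_namelen = sizeof(*dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IPV6;
	cm->cmsg_type = IPV6_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(pi));
	memcpy(CMSG_DATA(cm), &pi, sizeof(pi));

	cm = CMSG_NXTHDR(&msg, cm);
	cm->cmsg_level = IPPROTO_IPV6;
	cm->cmsg_type = IPV6_HOPLIMIT;	/* -1..255, range-checked by ip6_setpktopt() */
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cm), &hoplimit, sizeof(int));

	return (sendmsg(s, &msg, 0));
}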
*/ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. 
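The IPV6_PKTINFO case above implements the RFC 3542 section 6 rule that a sticky IPV6_PKTINFO is cleared by a regular setsockopt carrying the unspecified address and a zero interface index. A userland sketch of both directions (helper names are illustrative):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/* Pin the source address / outgoing interface for all packets. */
static int
set_sticky_pktinfo(int s, const struct in6_addr *src, unsigned int ifindex)
{
	struct in6_pktinfo pi;

	memset(&pi, 0, sizeof(pi));
	pi.ipi6_addr = *src;
	pi.ipi6_ifindex = ifindex;
	return (setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO, &pi, sizeof(pi)));
}

/* Clear it again: in6addr_any plus interface index 0 is the form the
 * kernel code above recognizes as "remove the sticky option". */
static int
clear_sticky_pktinfo(int s)
{
	struct in6_pktinfo pi;

	memset(&pi, 0, sizeof(pi));
	return (setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO, &pi, sizeof(pi)));
}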
*/ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. 
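The hop-by-hop and destination-option cases above all enforce the same encoding: the header's length field counts 8-octet units beyond the first 8 octets, so the full size is (len + 1) << 3 and must equal what the application passed in. A sketch of the smallest legal hop-by-hop header, padded with a single PadN option (helper name is illustrative):

#include <netinet/in.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 2 bytes of ip6_hbh plus a 6-byte PadN option: 8 bytes total, so
 * ip6h_len == 0 and (ip6h_len + 1) << 3 == 8, the identity the kernel
 * checks above. */
static size_t
build_minimal_hbh(uint8_t *buf, size_t buflen)
{
	if (buflen < 8)
		return (0);
	memset(buf, 0, 8);
	buf[0] = 0;			/* ip6h_nxt, rewritten by the stack */
	buf[1] = 0;			/* ip6h_len: no extra 8-octet units */
	buf[2] = IP6OPT_PADN;		/* PadN option type */
	buf[3] = 4;			/* PadN data length */
	/* buf[4..7] stay zero: PadN payload */
	return ((size_t)((buf[1] + 1) << 3));
}

Passing this buffer to setsockopt(s, IPPROTO_IPV6, IPV6_HOPOPTS, buf, 8) would exercise the privileged IPV6_HOPOPTS path above, since setting any hop-by-hop option requires PRIV_NETINET_SETHDROPTS.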
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. 
*/ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *inp) { int len; if (!inp->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(inp->in6p_outputopts->ip6po_hbh); if (inp->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(inp->in6p_outputopts->ip6po_dest1); len += elen(inp->in6p_outputopts->ip6po_rthdr); len += elen(inp->in6p_outputopts->ip6po_dest2); return len; #undef elen } diff --git a/sys/netinet6/sctp6_usrreq.c b/sys/netinet6/sctp6_usrreq.c index b544d2975c6a..fcf15e4f81bf 100644 --- a/sys/netinet6/sctp6_usrreq.c +++ b/sys/netinet6/sctp6_usrreq.c @@ -1,1180 +1,1180 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #ifdef INET6 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int sctp6_input_with_port(struct mbuf **i_pak, int *offp, uint16_t port) { struct mbuf *m; int iphlen; uint32_t vrf_id; uint8_t ecn_bits; struct sockaddr_in6 src, dst; struct ip6_hdr *ip6; struct sctphdr *sh; struct sctp_chunkhdr *ch; int length, offset; uint8_t compute_crc; uint32_t mflowid; uint8_t mflowtype; uint16_t fibnum; iphlen = *offp; if (SCTP_GET_PKT_VRFID(*i_pak, vrf_id)) { SCTP_RELEASE_PKT(*i_pak); return (IPPROTO_DONE); } m = SCTP_HEADER_TO_CHAIN(*i_pak); #ifdef SCTP_MBUF_LOGGING /* Log in any input mbufs */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_INPUT); } #endif #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(m); } #endif SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, "sctp6_input(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), (int)m->m_pkthdr.csum_flags, CSUM_BITS); mflowid = m->m_pkthdr.flowid; mflowtype = M_HASHTYPE_GET(m); fibnum = M_GETFIB(m); SCTP_STAT_INCR(sctps_recvpackets); SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); if (m->m_len < offset) { m = m_pullup(m, offset); if (m == NULL) { SCTP_STAT_INCR(sctps_hdrops); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); sh = (struct sctphdr *)(mtod(m, caddr_t)+iphlen); ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr)); offset -= sizeof(struct sctp_chunkhdr); memset(&src, 0, sizeof(struct sockaddr_in6)); src.sin6_family = AF_INET6; src.sin6_len = sizeof(struct sockaddr_in6); src.sin6_port = sh->src_port; src.sin6_addr = ip6->ip6_src; if (in6_setscope(&src.sin6_addr, m->m_pkthdr.rcvif, NULL) != 0) { goto out; } memset(&dst, 0, sizeof(struct sockaddr_in6)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(struct sockaddr_in6); dst.sin6_port = sh->dest_port; dst.sin6_addr = ip6->ip6_dst; if (in6_setscope(&dst.sin6_addr, m->m_pkthdr.rcvif, NULL) != 0) { goto out; } length = ntohs(ip6->ip6_plen) + iphlen; /* Validate mbuf chain length with IP payload length. 
*/ if (SCTP_HEADER_LEN(m) != length) { SCTPDBG(SCTP_DEBUG_INPUT1, "sctp6_input() length:%d reported length:%d\n", length, SCTP_HEADER_LEN(m)); SCTP_STAT_INCR(sctps_hdrops); goto out; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { goto out; } - ecn_bits = ((ntohl(ip6->ip6_flow) >> 20) & 0x000000ff); + ecn_bits = IPV6_TRAFFIC_CLASS(ip6); if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; } sctp_common_input_processing(&m, iphlen, offset, length, (struct sockaddr *)&src, (struct sockaddr *)&dst, sh, ch, compute_crc, ecn_bits, mflowtype, mflowid, fibnum, vrf_id, port); out: if (m) { sctp_m_freem(m); } return (IPPROTO_DONE); } int sctp6_input(struct mbuf **i_pak, int *offp, int proto SCTP_UNUSED) { return (sctp6_input_with_port(i_pak, offp, 0)); } void sctp6_notify(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, uint8_t icmp6_type, uint8_t icmp6_code, uint32_t next_mtu) { int timer_stopped; switch (icmp6_type) { case ICMP6_DST_UNREACH: if ((icmp6_code == ICMP6_DST_UNREACH_NOROUTE) || (icmp6_code == ICMP6_DST_UNREACH_ADMIN) || (icmp6_code == ICMP6_DST_UNREACH_BEYONDSCOPE) || (icmp6_code == ICMP6_DST_UNREACH_ADDR)) { /* Mark the net unreachable. */ if (net->dest_state & SCTP_ADDR_REACHABLE) { /* Ok that destination is not reachable */ net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } } SCTP_TCB_UNLOCK(stcb); break; case ICMP6_PARAM_PROB: /* Treat it like an ABORT. */ if (icmp6_code == ICMP6_PARAMPROB_NEXTHEADER) { sctp_abort_notification(stcb, 1, 0, NULL, SCTP_SO_NOT_LOCKED); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); } else { SCTP_TCB_UNLOCK(stcb); } break; case ICMP6_PACKET_TOO_BIG: if (net->dest_state & SCTP_ADDR_NO_PMTUD) { SCTP_TCB_UNLOCK(stcb); break; } if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { timer_stopped = 1; sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1); } else { timer_stopped = 0; } /* Update the path MTU. */ if (net->port) { next_mtu -= sizeof(struct udphdr); } if (net->mtu > next_mtu) { net->mtu = next_mtu; if (net->port) { sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu + sizeof(struct udphdr)); } else { sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu); } } /* Update the association MTU */ if (stcb->asoc.smallest_mtu > next_mtu) { sctp_pathmtu_adjustment(stcb, next_mtu); } /* Finally, start the PMTU timer if it was running before. */ if (timer_stopped) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } SCTP_TCB_UNLOCK(stcb); break; default: SCTP_TCB_UNLOCK(stcb); break; } } void sctp6_ctlinput(int cmd, struct sockaddr *pktdst, void *d) { struct ip6ctlparam *ip6cp; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct sctphdr sh; struct sockaddr_in6 src, dst; if (pktdst->sa_family != AF_INET6 || pktdst->sa_len != sizeof(struct sockaddr_in6)) { return; } if ((unsigned)cmd >= PRC_NCMDS) { return; } if (PRC_IS_REDIRECT(cmd)) { d = NULL; } else if (inet6ctlerrmap[cmd] == 0) { return; } /* If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; } else { ip6cp = (struct ip6ctlparam *)NULL; } if (ip6cp != NULL) { /* * XXX: We assume that when IPV6 is non NULL, M and OFF are * valid. 
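The hunk above replaces the open-coded shift with IPV6_TRAFFIC_CLASS(). Both forms extract the 8-bit traffic class from the version/traffic-class/flow-label word, and the ECN codepoint consumed by sctp_common_input_processing() is its low two bits. A standalone sketch; the #ifndef fallback assumes the macro expands to the same shift-and-mask, and the sample values are arbitrary:

#include <netinet/in.h>
#include <netinet/ip6.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifndef IPV6_TRAFFIC_CLASS
#define	IPV6_TRAFFIC_CLASS(ip6)	((ntohl((ip6)->ip6_flow) >> 20) & 0xff)
#endif

int
main(void)
{
	struct ip6_hdr ip6;
	uint8_t tclass, ecn;

	memset(&ip6, 0, sizeof(ip6));
	/* version 6, traffic class 0xb9 (DSCP 46, ECN 01), flow label 0x12345 */
	ip6.ip6_flow = htonl((6U << 28) | (0xb9U << 20) | 0x12345U);

	tclass = IPV6_TRAFFIC_CLASS(&ip6);
	ecn = tclass & 0x03;		/* ECN codepoint: low two bits */

	printf("tclass 0x%02x dscp %u ecn %u\n", tclass, tclass >> 2, ecn);
	return (0);
}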
*/ if (ip6cp->ip6c_m == NULL) { return; } /* * Check if we can safely examine the ports and the * verification tag of the SCTP common header. */ if (ip6cp->ip6c_m->m_pkthdr.len < (int32_t)(ip6cp->ip6c_off + offsetof(struct sctphdr, checksum))) { return; } /* Copy out the port numbers and the verification tag. */ memset(&sh, 0, sizeof(sh)); m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t), (caddr_t)&sh); memset(&src, 0, sizeof(struct sockaddr_in6)); src.sin6_family = AF_INET6; src.sin6_len = sizeof(struct sockaddr_in6); src.sin6_port = sh.src_port; src.sin6_addr = ip6cp->ip6c_ip6->ip6_src; if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { return; } memset(&dst, 0, sizeof(struct sockaddr_in6)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(struct sockaddr_in6); dst.sin6_port = sh.dest_port; dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst; if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { return; } inp = NULL; net = NULL; stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, (struct sockaddr *)&src, &inp, &net, 1, SCTP_DEFAULT_VRFID); if ((stcb != NULL) && (net != NULL) && (inp != NULL)) { /* Check the verification tag */ if (ntohl(sh.v_tag) != 0) { /* * This must be the verification tag used * for sending out packets. We don't * consider packets reflecting the * verification tag. */ if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) { SCTP_TCB_UNLOCK(stcb); return; } } else { if (ip6cp->ip6c_m->m_pkthdr.len >= ip6cp->ip6c_off + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) { /* * In this case we can check if we * got an INIT chunk and if the * initiate tag matches. */ uint32_t initiate_tag; uint8_t chunk_type; m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off + sizeof(struct sctphdr), sizeof(uint8_t), (caddr_t)&chunk_type); m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr), sizeof(uint32_t), (caddr_t)&initiate_tag); if ((chunk_type != SCTP_INITIATION) || (ntohl(initiate_tag) != stcb->asoc.my_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } } else { SCTP_TCB_UNLOCK(stcb); return; } } sctp6_notify(inp, stcb, net, ip6cp->ip6c_icmp6->icmp6_type, ip6cp->ip6c_icmp6->icmp6_code, ntohl(ip6cp->ip6c_icmp6->icmp6_mtu)); } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce inp's ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } if (stcb) { SCTP_TCB_UNLOCK(stcb); } } } } /* * this routine can probably be collasped into the one in sctp_userreq.c * since they do the same thing and now we lookup with a sockaddr */ static int sctp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct sctp_inpcb *inp; struct sctp_nets *net; struct sctp_tcb *stcb; int error; uint32_t vrf_id; vrf_id = SCTP_DEFAULT_VRFID; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if (req->oldlen != sizeof(struct ucred)) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); stcb = sctp_findassociation_addr_sa(sin6tosa(&addrs[1]), sin6tosa(&addrs[0]), &inp, &net, 1, vrf_id); if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) { if ((inp != NULL) && (stcb == NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); 
SCTP_INP_DECR_REF(inp); goto cred_can_cont; } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT); error = ENOENT; goto out; } SCTP_TCB_UNLOCK(stcb); /* * We use the write lock here, only since in the error leg we need * it. If we used RLOCK, then we would have to * wlock/decr/unlock/rlock. Which in theory could create a hole. * Better to use higher wlock. */ SCTP_INP_WLOCK(inp); cred_can_cont: error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket); if (error) { SCTP_INP_WUNLOCK(inp); goto out; } cru2x(inp->sctp_socket->so_cred, &xuc); SCTP_INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); out: return (error); } SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sctp6_getcred, "S,ucred", "Get the ucred of a SCTP6 connection"); /* This is the same as the sctp_abort() could be made common */ static void sctp6_abort(struct socket *so) { struct epoch_tracker et; struct sctp_inpcb *inp; uint32_t flags; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return; } NET_EPOCH_ENTER(et); sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 17); #endif if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 16); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); SOCK_LOCK(so); SCTP_SB_CLEAR(so->so_snd); /* * same for the rcv ones, they are only here for the * accounting/select. */ SCTP_SB_CLEAR(so->so_rcv); /* Now null out the reference, we are completely detached. */ so->so_pcb = NULL; SOCK_UNLOCK(so); } else { flags = inp->sctp_flags; if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { goto sctp_must_try_again; } } NET_EPOCH_EXIT(et); return; } static int sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED) { int error; struct sctp_inpcb *inp; uint32_t vrf_id = SCTP_DEFAULT_VRFID; inp = (struct sctp_inpcb *)so->so_pcb; if (inp != NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace)); if (error) return (error); } error = sctp_inpcb_alloc(so, vrf_id); if (error) return (error); inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_WLOCK(inp); inp->sctp_flags |= SCTP_PCB_FLAGS_BOUND_V6; /* I'm v6! */ inp->ip_inp.inp.inp_vflag |= INP_IPV6; inp->ip_inp.inp.in6p_hops = -1; /* use kernel default */ inp->ip_inp.inp.in6p_cksum = -1; /* just to be sure */ #ifdef INET /* * XXX: ugly!! IPv4 TTL initialization is necessary for an IPv6 * socket as well, because the socket may be bound to an IPv6 * wildcard address, which may match an IPv4-mapped IPv6 address. 
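The comment above hinges on IPv4-mapped IPv6 addresses of the form ::ffff:a.b.c.d: an AF_INET6 wildcard socket can receive IPv4 traffic through them, which is also why sctp6_bind() and sctp6_send() below convert such sockaddr_in6 values back to sockaddr_in with in6_sin6_2_sin(). A small userland sketch of what such an address looks like (the literal address is just an example from the documentation prefix):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in6 sin6;
	struct in_addr v4;
	char buf[INET6_ADDRSTRLEN];

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &sin6.sin6_addr);

	if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) {
		/* the IPv4 address lives in the last four bytes */
		memcpy(&v4, &sin6.sin6_addr.s6_addr[12], sizeof(v4));
		printf("mapped from %s\n",
		    inet_ntop(AF_INET, &v4, buf, sizeof(buf)));
	}
	return (0);
}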
*/ inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); #endif SCTP_INP_WUNLOCK(inp); return (0); } static int sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) { struct sctp_inpcb *inp; int error; u_char vflagsav; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if (addr) { switch (addr->sa_family) { #ifdef INET case AF_INET: if (addr->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } break; #endif #ifdef INET6 case AF_INET6: if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } break; #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } } vflagsav = inp->ip_inp.inp.inp_vflag; inp->ip_inp.inp.inp_vflag &= ~INP_IPV4; inp->ip_inp.inp.inp_vflag |= INP_IPV6; if ((addr != NULL) && (SCTP_IPV6_V6ONLY(inp) == 0)) { switch (addr->sa_family) { #ifdef INET case AF_INET: /* binding v4 addr to v6 socket, so reset flags */ inp->ip_inp.inp.inp_vflag |= INP_IPV4; inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) { inp->ip_inp.inp.inp_vflag |= INP_IPV4; } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->ip_inp.inp.inp_vflag |= INP_IPV4; inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; error = sctp_inpcb_bind(so, (struct sockaddr *)&sin, NULL, p); goto out; } #endif break; } #endif default: break; } } else if (addr != NULL) { struct sockaddr_in6 *sin6_p; /* IPV6_V6ONLY socket */ #ifdef INET if (addr->sa_family == AF_INET) { /* can't bind v4 addr to v6 only socket! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); error = EINVAL; goto out; } #endif sin6_p = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { /* can't bind v4-mapped addrs either! */ /* NOTE: we don't support SIIT */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); error = EINVAL; goto out; } } error = sctp_inpcb_bind(so, addr, NULL, p); out: if (error != 0) inp->ip_inp.inp.inp_vflag = vflagsav; return (error); } static void sctp6_close(struct socket *so) { sctp_close(so); } /* This could be made common with sctp_detach() since they are identical */ static int sctp6_disconnect(struct socket *so) { return (sctp_disconnect(so)); } int sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p); static int sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { struct sctp_inpcb *inp; #ifdef INET struct sockaddr_in6 *sin6; #endif /* INET */ /* No SPL needed since sctp_output does this */ inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { if (control) { SCTP_RELEASE_PKT(control); control = NULL; } SCTP_RELEASE_PKT(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } /* * For the TCP model we may get a NULL addr, if we are a connected * socket thats ok. 
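The SCTP_IPV6_V6ONLY() checks in sctp6_bind() above and in sctp6_send()/sctp6_connect() below mirror the generic IPV6_V6ONLY socket option: with it set, AF_INET and v4-mapped peers are rejected with EINVAL; without it, a wildcard bind serves both families. A hypothetical one-to-one-style listener sketch (the port and error handling are illustrative, and SCTP support depends on the kernel configuration):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

static int
open_sctp6_listener(int v6only)
{
	struct sockaddr_in6 sin6;
	int s;

	if ((s = socket(AF_INET6, SOCK_STREAM, IPPROTO_SCTP)) == -1)
		return (-1);
	if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &v6only,
	    sizeof(v6only)) == -1)
		goto bad;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_port = htons(5001);	/* arbitrary example port */
	sin6.sin6_addr = in6addr_any;
	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == -1 ||
	    listen(s, 8) == -1)
		goto bad;
	return (s);
bad:
	close(s);
	return (-1);
}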
*/ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) && (addr == NULL)) { goto connected_type; } if (addr == NULL) { SCTP_RELEASE_PKT(m); if (control) { SCTP_RELEASE_PKT(control); control = NULL; } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EDESTADDRREQ); return (EDESTADDRREQ); } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, we discard datagrams destined to a * v4 addr or v4-mapped addr */ if (addr->sa_family == AF_INET) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; /* convert v4-mapped into v4 addr and send */ in6_sin6_2_sin(&sin, sin6); return (sctp_sendm(so, flags, m, (struct sockaddr *)&sin, control, p)); } #endif /* INET */ connected_type: /* now what about control */ if (control) { if (inp->control) { SCTP_PRINTF("huh? control set?\n"); SCTP_RELEASE_PKT(inp->control); inp->control = NULL; } inp->control = control; } /* Place the data */ if (inp->pkt) { SCTP_BUF_NEXT(inp->pkt_last) = m; inp->pkt_last = m; } else { inp->pkt_last = inp->pkt = m; } if ( /* FreeBSD and MacOSX uses a flag passed */ ((flags & PRUS_MORETOCOME) == 0) ) { /* * note with the current version this code will only be used * by OpenBSD, NetBSD and FreeBSD have methods for * re-defining sosend() to use sctp_sosend(). One can * optionaly switch back to this code (by changing back the * defininitions but this is not advisable. */ struct epoch_tracker et; int ret; NET_EPOCH_ENTER(et); ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags); NET_EPOCH_EXIT(et); inp->pkt = NULL; inp->control = NULL; return (ret); } else { return (0); } } static int sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { struct epoch_tracker et; uint32_t vrf_id; int error = 0; struct sctp_inpcb *inp; struct sctp_tcb *stcb; #ifdef INET struct sockaddr_in6 *sin6; union sctp_sockstore store; #endif inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET); return (ECONNRESET); /* I made the same as TCP since we are * not setup? 
*/ } if (addr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } switch (addr->sa_family) { #ifdef INET case AF_INET: if (addr->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } break; #endif #ifdef INET6 case AF_INET6: if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } break; #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } vrf_id = inp->def_vrf_id; SCTP_ASOC_CREATE_LOCK(inp); SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == SCTP_PCB_FLAGS_UNBOUND) { /* Bind a ephemeral port */ SCTP_INP_RUNLOCK(inp); error = sctp6_bind(so, NULL, p); if (error) { SCTP_ASOC_CREATE_UNLOCK(inp); return (error); } SCTP_INP_RLOCK(inp); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_INP_RUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EADDRINUSE); return (EADDRINUSE); } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, ignore connections destined to a v4 * addr or v4-mapped addr */ if (addr->sa_family == AF_INET) { SCTP_INP_RUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { SCTP_INP_RUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* convert v4-mapped into v4 addr */ in6_sin6_2_sin(&store.sin, sin6); addr = &store.sa; } #endif /* INET */ /* Now do we connect? */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { SCTP_INP_RUNLOCK(inp); SCTP_INP_WLOCK(inp); SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } } if (stcb != NULL) { /* Already have or am bring up an association */ SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EALREADY); return (EALREADY); } /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, p, SCTP_INITIALIZE_AUTH_PARAMS); SCTP_ASOC_CREATE_UNLOCK(inp); if (stcb == NULL) { /* Gak! no memory */ return (error); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* Set the connected flag so we can queue data */ soisconnecting(so); } SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); NET_EPOCH_ENTER(et); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); NET_EPOCH_EXIT(et); return (error); } static int sctp6_getaddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in6 *sin6; struct sctp_inpcb *inp; uint32_t vrf_id; struct sctp_ifa *sctp_ifa; int error; /* * Do the malloc first in case it blocks. 
*/ SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof(*sin6)); if (sin6 == NULL) return (ENOMEM); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET); return (ECONNRESET); } SCTP_INP_RLOCK(inp); sin6->sin6_port = inp->sctp_lport; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* For the bound all case you get back 0 */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { struct sctp_tcb *stcb; struct sockaddr_in6 *sin_a6; struct sctp_nets *net; int fnd; stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT); return (ENOENT); } fnd = 0; sin_a6 = NULL; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a6 = (struct sockaddr_in6 *)&net->ro._l_addr; if (sin_a6 == NULL) /* this will make coverity happy */ continue; if (sin_a6->sin6_family == AF_INET6) { fnd = 1; break; } } if ((!fnd) || (sin_a6 == NULL)) { /* punt */ SCTP_INP_RUNLOCK(inp); SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT); return (ENOENT); } vrf_id = inp->def_vrf_id; sctp_ifa = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); if (sctp_ifa) { sin6->sin6_addr = sctp_ifa->address.sin6.sin6_addr; } } else { /* For the bound all case you get back 0 */ memset(&sin6->sin6_addr, 0, sizeof(sin6->sin6_addr)); } } else { /* Take the first IPv6 address in the list */ struct sctp_laddr *laddr; int fnd = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->address.sa.sa_family == AF_INET6) { struct sockaddr_in6 *sin_a; sin_a = &laddr->ifa->address.sin6; sin6->sin6_addr = sin_a->sin6_addr; fnd = 1; break; } } if (!fnd) { SCTP_FREE_SONAME(sin6); SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT); return (ENOENT); } } SCTP_INP_RUNLOCK(inp); /* Scoping things for v6 */ if ((error = sa6_recoverscope(sin6)) != 0) { SCTP_FREE_SONAME(sin6); return (error); } (*addr) = (struct sockaddr *)sin6; return (0); } static int sctp6_peeraddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in6 *sin6; int fnd; struct sockaddr_in6 *sin_a6; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; int error; /* Do the malloc first in case it blocks. 
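sctp6_getaddr() above runs its result through sa6_recoverscope(), so userland sees link-local addresses with the zone expressed in sin6_scope_id rather than embedded in the address bytes as the kernel stores them internally. A sketch of the userland form (the interface name is a placeholder):

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
	/* zone for "%em0", i.e. what sa6_recoverscope() fills in */
	sin6.sin6_scope_id = if_nametoindex("em0");

	printf("link-local, zone %u\n", sin6.sin6_scope_id);
	return (0);
}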
*/ SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6); if (sin6 == NULL) return (ENOMEM); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); inp = (struct sctp_inpcb *)so->so_pcb; if ((inp == NULL) || ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* UDP type and listeners will drop out here */ SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOTCONN); return (ENOTCONN); } SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); if (stcb == NULL) { SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET); return (ECONNRESET); } fnd = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a6 = (struct sockaddr_in6 *)&net->ro._l_addr; if (sin_a6->sin6_family == AF_INET6) { fnd = 1; sin6->sin6_port = stcb->rport; sin6->sin6_addr = sin_a6->sin6_addr; break; } } SCTP_TCB_UNLOCK(stcb); if (!fnd) { /* No IPv4 address */ SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT); return (ENOENT); } if ((error = sa6_recoverscope(sin6)) != 0) { SCTP_FREE_SONAME(sin6); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, error); return (error); } *addr = (struct sockaddr *)sin6; return (0); } static int sctp6_in6getaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = sotoinpcb(so); int error; if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } /* allow v6 addresses precedence */ error = sctp6_getaddr(so, nam); #ifdef INET if (error) { struct sockaddr_in6 *sin6; /* try v4 next if v6 failed */ error = sctp_ingetaddr(so, nam); if (error) { return (error); } SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6); if (sin6 == NULL) { SCTP_FREE_SONAME(*nam); return (ENOMEM); } in6_sin_2_v4mapsin6((struct sockaddr_in *)*nam, sin6); SCTP_FREE_SONAME(*nam); *nam = (struct sockaddr *)sin6; } #endif return (error); } static int sctp6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = sotoinpcb(so); int error; if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } /* allow v6 addresses precedence */ error = sctp6_peeraddr(so, nam); #ifdef INET if (error) { struct sockaddr_in6 *sin6; /* try v4 next if v6 failed */ error = sctp_peeraddr(so, nam); if (error) { return (error); } SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6); if (sin6 == NULL) { SCTP_FREE_SONAME(*nam); return (ENOMEM); } in6_sin_2_v4mapsin6((struct sockaddr_in *)*nam, sin6); SCTP_FREE_SONAME(*nam); *nam = (struct sockaddr *)sin6; } #endif return (error); } struct pr_usrreqs sctp6_usrreqs = { .pru_abort = sctp6_abort, .pru_accept = sctp_accept, .pru_attach = sctp6_attach, .pru_bind = sctp6_bind, .pru_connect = sctp6_connect, .pru_control = in6_control, .pru_close = sctp6_close, .pru_detach = sctp6_close, .pru_sopoll = sopoll_generic, .pru_flush = sctp_flush, .pru_disconnect = sctp6_disconnect, .pru_listen = sctp_listen, .pru_peeraddr = sctp6_getpeeraddr, .pru_send = sctp6_send, .pru_shutdown = sctp_shutdown, .pru_sockaddr = sctp6_in6getaddr, .pru_sosend = sctp_sosend, .pru_soreceive = sctp_soreceive }; #endif diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index f71f89187b58..c131f810b0ec 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -1,6725 +1,6725 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Daniel Hartmeier * Copyright 
(c) 2002 - 2008 Henning Brauer * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif /* INET6 */ #if defined(SCTP) || defined(SCTP_SUPPORT) #include #endif #include #include #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x /* * Global variables */ /* state tables */ VNET_DEFINE(struct pf_altqqueue, pf_altqs[4]); VNET_DEFINE(struct pf_kpalist, pf_pabuf); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_inactive); VNET_DEFINE(struct pf_kstatus, pf_status); VNET_DEFINE(u_int32_t, ticket_altqs_active); VNET_DEFINE(u_int32_t, ticket_altqs_inactive); VNET_DEFINE(int, altqs_inactive_open); VNET_DEFINE(u_int32_t, ticket_pabuf); VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); #define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) VNET_DEFINE(u_char, pf_tcp_secret[16]); #define V_pf_tcp_secret VNET(pf_tcp_secret) VNET_DEFINE(int, pf_tcp_secret_init); #define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) VNET_DEFINE(int, pf_tcp_iss_off); #define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) VNET_DECLARE(int, pf_vnet_active); #define V_pf_vnet_active VNET(pf_vnet_active) VNET_DEFINE_STATIC(uint32_t, pf_purge_idx); #define V_pf_purge_idx VNET(pf_purge_idx) /* * Queue for pf_intr() sends. 
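The send queue declared below follows a common deferred-work shape: producers append pf_send_entry items to a STAILQ under pf_sendqueue_mtx and pf_intr() later drains them. A minimal userland analogue of that pattern, not the pf code itself (names are made up; the kernel uses its own mutex and software-interrupt machinery):

#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>

struct work_entry {
	STAILQ_ENTRY(work_entry) next;
	int payload;
};
STAILQ_HEAD(work_head, work_entry);

static struct work_head queue = STAILQ_HEAD_INITIALIZER(queue);
static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
enqueue(int payload)
{
	struct work_entry *e = malloc(sizeof(*e));

	if (e == NULL)
		return;			/* example policy: drop on failure */
	e->payload = payload;
	pthread_mutex_lock(&queue_mtx);
	STAILQ_INSERT_TAIL(&queue, e, next);
	pthread_mutex_unlock(&queue_mtx);
}

static void
drain(void (*process)(int))
{
	struct work_head local = STAILQ_HEAD_INITIALIZER(local);
	struct work_entry *e;

	pthread_mutex_lock(&queue_mtx);
	STAILQ_CONCAT(&local, &queue);	/* detach everything at once */
	pthread_mutex_unlock(&queue_mtx);

	while ((e = STAILQ_FIRST(&local)) != NULL) {
		STAILQ_REMOVE_HEAD(&local, next);
		process(e->payload);
		free(e);
	}
}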
*/ static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); struct pf_send_entry { STAILQ_ENTRY(pf_send_entry) pfse_next; struct mbuf *pfse_m; enum { PFSE_IP, PFSE_IP6, PFSE_ICMP, PFSE_ICMP6, } pfse_type; struct { int type; int code; int mtu; } icmpopts; }; STAILQ_HEAD(pf_send_head, pf_send_entry); VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue); #define V_pf_sendqueue VNET(pf_sendqueue) static struct mtx pf_sendqueue_mtx; MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF); #define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) #define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) /* * Queue for pf_overload_task() tasks. */ struct pf_overload_entry { SLIST_ENTRY(pf_overload_entry) next; struct pf_addr addr; sa_family_t af; uint8_t dir; struct pf_krule *rule; }; SLIST_HEAD(pf_overload_head, pf_overload_entry); VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue); #define V_pf_overloadqueue VNET(pf_overloadqueue) VNET_DEFINE_STATIC(struct task, pf_overloadtask); #define V_pf_overloadtask VNET(pf_overloadtask) static struct mtx pf_overloadqueue_mtx; MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx, "pf overload/flush queue", MTX_DEF); #define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx) #define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx) VNET_DEFINE(struct pf_krulequeue, pf_unlinked_rules); struct mtx pf_unlnkdrules_mtx; MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules", MTX_DEF); VNET_DEFINE_STATIC(uma_zone_t, pf_sources_z); #define V_pf_sources_z VNET(pf_sources_z) uma_zone_t pf_mtag_z; VNET_DEFINE(uma_zone_t, pf_state_z); VNET_DEFINE(uma_zone_t, pf_state_key_z); VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]); #define PFID_CPUBITS 8 #define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS) #define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT) #define PFID_MAXID (~PFID_CPUMASK) CTASSERT((1 << PFID_CPUBITS) >= MAXCPU); static void pf_src_tree_remove_state(struct pf_state *); static void pf_init_threshold(struct pf_threshold *, u_int32_t, u_int32_t); static void pf_add_threshold(struct pf_threshold *); static int pf_check_threshold(struct pf_threshold *); static void pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *, u_int16_t *, u_int16_t *, struct pf_addr *, u_int16_t, u_int8_t, sa_family_t); static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *); static void pf_change_icmp(struct pf_addr *, u_int16_t *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t *, u_int16_t *, u_int16_t *, u_int16_t *, u_int8_t, sa_family_t); static void pf_send_tcp(struct mbuf *, const struct pf_krule *, sa_family_t, const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, u_int16_t, struct ifnet *); static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_krule *); static void pf_detach_state(struct pf_state *); static int pf_state_key_attach(struct pf_state_key *, struct pf_state_key *, struct pf_state *); static void pf_state_key_detach(struct pf_state *, int); static int pf_state_key_ctor(void *, int, void *, int); static u_int32_t pf_tcp_iss(struct pf_pdesc *); static int pf_test_rule(struct pf_krule **, struct pf_state **, int, struct pfi_kkif *, struct mbuf *, int, struct pf_pdesc *, struct pf_krule **, struct pf_kruleset **, struct inpcb *); static int pf_create_state(struct pf_krule *, struct pf_krule *, 
struct pf_krule *, struct pf_pdesc *, struct pf_ksrc_node *, struct pf_state_key *, struct pf_state_key *, struct mbuf *, int, u_int16_t, u_int16_t, int *, struct pfi_kkif *, struct pf_state **, int, u_int16_t, u_int16_t, int); static int pf_test_fragment(struct pf_krule **, int, struct pfi_kkif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_krule **, struct pf_kruleset **); static int pf_tcp_track_full(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pfi_kkif *, struct mbuf *, int, struct pf_pdesc *, u_short *, int *); static int pf_tcp_track_sloppy(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pf_pdesc *, u_short *); static int pf_test_state_tcp(struct pf_state **, int, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_udp(struct pf_state **, int, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *); static int pf_test_state_icmp(struct pf_state **, int, struct pfi_kkif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_other(struct pf_state **, int, struct pfi_kkif *, struct mbuf *, struct pf_pdesc *); static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, int, u_int16_t); static int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); static void pf_print_state_parts(struct pf_state *, struct pf_state_key *, struct pf_state_key *); static int pf_addr_wrap_neq(struct pf_addr_wrap *, struct pf_addr_wrap *); static void pf_patch_8(struct mbuf *, u_int16_t *, u_int8_t *, u_int8_t, bool, u_int8_t); static struct pf_state *pf_find_state(struct pfi_kkif *, struct pf_state_key_cmp *, u_int); static int pf_src_connlimit(struct pf_state **); static void pf_overload_task(void *v, int pending); static int pf_insert_src_node(struct pf_ksrc_node **, struct pf_krule *, struct pf_addr *, sa_family_t); static u_int pf_purge_expired_states(u_int, int); static void pf_purge_unlinked_rules(void); static int pf_mtag_uminit(void *, int, int); static void pf_mtag_free(struct m_tag *); #ifdef INET static void pf_route(struct mbuf **, struct pf_krule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *, struct inpcb *); #endif /* INET */ #ifdef INET6 static void pf_change_a6(struct pf_addr *, u_int16_t *, struct pf_addr *, u_int8_t); static void pf_route6(struct mbuf **, struct pf_krule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *, struct inpcb *); #endif /* INET6 */ int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); extern int pf_end_threads; extern struct proc *pf_purge_proc; VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ (pd)->pf_mtag->flags & PF_PACKET_LOOPED) #define STATE_LOOKUP(i, k, d, s, pd) \ do { \ (s) = pf_find_state((i), (k), (d)); \ if ((s) == NULL) \ return (PF_DROP); \ if (PACKET_LOOPED(pd)) \ return (PF_PASS); \ if ((d) == PF_OUT && \ (((s)->rule.ptr->rt == PF_ROUTETO && \ (s)->rule.ptr->direction == PF_OUT) || \ ((s)->rule.ptr->rt == PF_REPLYTO && \ (s)->rule.ptr->direction == PF_IN)) && \ (s)->rt_kif != NULL && \ (s)->rt_kif != (i)) \ return (PF_PASS); \ } while (0) #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? 
(k) : V_pfi_all #define STATE_INC_COUNTERS(s) \ do { \ counter_u64_add(s->rule.ptr->states_cur, 1); \ counter_u64_add(s->rule.ptr->states_tot, 1); \ if (s->anchor.ptr != NULL) { \ counter_u64_add(s->anchor.ptr->states_cur, 1); \ counter_u64_add(s->anchor.ptr->states_tot, 1); \ } \ if (s->nat_rule.ptr != NULL) { \ counter_u64_add(s->nat_rule.ptr->states_cur, 1);\ counter_u64_add(s->nat_rule.ptr->states_tot, 1);\ } \ } while (0) #define STATE_DEC_COUNTERS(s) \ do { \ if (s->nat_rule.ptr != NULL) \ counter_u64_add(s->nat_rule.ptr->states_cur, -1);\ if (s->anchor.ptr != NULL) \ counter_u64_add(s->anchor.ptr->states_cur, -1); \ counter_u64_add(s->rule.ptr->states_cur, -1); \ } while (0) MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(struct pf_srchash *, pf_srchash); SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "pf(4)"); u_long pf_hashmask; u_long pf_srchashmask; static u_long pf_hashsize; static u_long pf_srchashsize; u_long pf_ioctl_maxcount = 65535; SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, &pf_hashsize, 0, "Size of pf(4) states hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN, &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call"); VNET_DEFINE(void *, pf_swi_cookie); VNET_DEFINE(uint32_t, pf_hashseed); #define V_pf_hashseed VNET(pf_hashseed) int pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->addr32[3] > b->addr32[3]) return (1); if (a->addr32[3] < b->addr32[3]) return (-1); if (a->addr32[2] > b->addr32[2]) return (1); if (a->addr32[2] < b->addr32[2]) return (-1); if (a->addr32[1] > b->addr32[1]) return (1); if (a->addr32[1] < b->addr32[1]) return (-1); if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET6 */ default: panic("%s: unknown address family %u", __func__, af); } return (0); } static __inline uint32_t pf_hashkey(struct pf_state_key *sk) { uint32_t h; h = murmur3_32_hash32((uint32_t *)sk, sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), V_pf_hashseed); return (h & pf_hashmask); } static __inline uint32_t pf_hashsrc(struct pf_addr *addr, sa_family_t af) { uint32_t h; switch (af) { case AF_INET: h = murmur3_32_hash32((uint32_t *)&addr->v4, sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed); break; case AF_INET6: h = murmur3_32_hash32((uint32_t *)&addr->v6, sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed); break; default: panic("%s: unknown address family %u", __func__, af); } return (h & pf_srchashmask); } #ifdef ALTQ static int pf_state_hash(struct pf_state *s) { u_int32_t hv = (intptr_t)s / sizeof(*s); hv ^= crc32(&s->src, sizeof(s->src)); hv ^= crc32(&s->dst, sizeof(s->dst)); if (hv == 0) hv = 1; return (hv); } #endif #ifdef INET6 void pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: dst->addr32[0] = src->addr32[0]; break; #endif /* INET */ case AF_INET6: dst->addr32[0] = src->addr32[0]; dst->addr32[1] = src->addr32[1]; dst->addr32[2] = src->addr32[2]; dst->addr32[3] = src->addr32[3]; break; } } 
#endif /* INET6 */ static void pf_init_threshold(struct pf_threshold *threshold, u_int32_t limit, u_int32_t seconds) { threshold->limit = limit * PF_THRESHOLD_MULT; threshold->seconds = seconds; threshold->count = 0; threshold->last = time_uptime; } static void pf_add_threshold(struct pf_threshold *threshold) { u_int32_t t = time_uptime, diff = t - threshold->last; if (diff >= threshold->seconds) threshold->count = 0; else threshold->count -= threshold->count * diff / threshold->seconds; threshold->count += PF_THRESHOLD_MULT; threshold->last = t; } static int pf_check_threshold(struct pf_threshold *threshold) { return (threshold->count > threshold->limit); } static int pf_src_connlimit(struct pf_state **state) { struct pf_overload_entry *pfoe; int bad = 0; PF_STATE_LOCK_ASSERT(*state); (*state)->src_node->conn++; (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); if ((*state)->rule.ptr->max_src_conn && (*state)->rule.ptr->max_src_conn < (*state)->src_node->conn) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1); bad++; } if ((*state)->rule.ptr->max_src_conn_rate.limit && pf_check_threshold(&(*state)->src_node->conn_rate)) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1); bad++; } if (!bad) return (0); /* Kill this state. */ (*state)->timeout = PFTM_PURGE; (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; if ((*state)->rule.ptr->overload_tbl == NULL) return (1); /* Schedule overloading and flushing task. */ pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT); if (pfoe == NULL) return (1); /* too bad :( */ bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr)); pfoe->af = (*state)->key[PF_SK_WIRE]->af; pfoe->rule = (*state)->rule.ptr; pfoe->dir = (*state)->direction; PF_OVERLOADQ_LOCK(); SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next); PF_OVERLOADQ_UNLOCK(); taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask); return (1); } static void pf_overload_task(void *v, int pending) { struct pf_overload_head queue; struct pfr_addr p; struct pf_overload_entry *pfoe, *pfoe1; uint32_t killed = 0; CURVNET_SET((struct vnet *)v); PF_OVERLOADQ_LOCK(); queue = V_pf_overloadqueue; SLIST_INIT(&V_pf_overloadqueue); PF_OVERLOADQ_UNLOCK(); bzero(&p, sizeof(p)); SLIST_FOREACH(pfoe, &queue, next) { counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("%s: blocking address ", __func__); pf_print_host(&pfoe->addr, 0, pfoe->af); printf("\n"); } p.pfra_af = pfoe->af; switch (pfoe->af) { #ifdef INET case AF_INET: p.pfra_net = 32; p.pfra_ip4addr = pfoe->addr.v4; break; #endif #ifdef INET6 case AF_INET6: p.pfra_net = 128; p.pfra_ip6addr = pfoe->addr.v6; break; #endif } PF_RULES_WLOCK(); pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second); PF_RULES_WUNLOCK(); } /* * Remove those entries, that don't need flushing. */ SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) if (pfoe->rule->flush == 0) { SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next); free(pfoe, M_PFTEMP); } else counter_u64_add( V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1); /* If nothing to flush, return. 
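 * Otherwise the loop below walks every ID hash row and marks each state that * matches a queued address for immediate purge (PFTM_PURGE, TCPS_CLOSED); * the actual teardown is left to the purge thread.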
*/ if (SLIST_EMPTY(&queue)) { CURVNET_RESTORE(); return; } for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state_key *sk; struct pf_state *s; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { sk = s->key[PF_SK_WIRE]; SLIST_FOREACH(pfoe, &queue, next) if (sk->af == pfoe->af && ((pfoe->rule->flush & PF_FLUSH_GLOBAL) || pfoe->rule == s->rule.ptr) && ((pfoe->dir == PF_OUT && PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) || (pfoe->dir == PF_IN && PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) { s->timeout = PFTM_PURGE; s->src.state = s->dst.state = TCPS_CLOSED; killed++; } } PF_HASHROW_UNLOCK(ih); } SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) free(pfoe, M_PFTEMP); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: %u states killed", __func__, killed); CURVNET_RESTORE(); } /* * Can return locked on failure, so that we can consistently * allocate and insert a new one. */ struct pf_ksrc_node * pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af, int returnlocked) { struct pf_srchash *sh; struct pf_ksrc_node *n; counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) if (n->rule.ptr == rule && n->af == af && ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) break; if (n != NULL) { n->states++; PF_HASHROW_UNLOCK(sh); } else if (returnlocked == 0) PF_HASHROW_UNLOCK(sh); return (n); } static void pf_free_src_node(struct pf_ksrc_node *sn) { for (int i = 0; i < 2; i++) { if (sn->bytes[i]) counter_u64_free(sn->bytes[i]); if (sn->packets[i]) counter_u64_free(sn->packets[i]); } uma_zfree(V_pf_sources_z, sn); } static int pf_insert_src_node(struct pf_ksrc_node **sn, struct pf_krule *rule, struct pf_addr *src, sa_family_t af) { KASSERT((rule->rule_flag & PFRULE_SRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR), ("%s for non-tracking rule %p", __func__, rule)); if (*sn == NULL) *sn = pf_find_src_node(src, rule, af, 1); if (*sn == NULL) { struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_ASSERT(sh); if (!rule->max_src_nodes || counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes) (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); else counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1); if ((*sn) == NULL) { PF_HASHROW_UNLOCK(sh); return (-1); } for (int i = 0; i < 2; i++) { (*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT); (*sn)->packets[i] = counter_u64_alloc(M_NOWAIT); if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) { pf_free_src_node(*sn); PF_HASHROW_UNLOCK(sh); return (-1); } } pf_init_threshold(&(*sn)->conn_rate, rule->max_src_conn_rate.limit, rule->max_src_conn_rate.seconds); (*sn)->af = af; (*sn)->rule.ptr = rule; PF_ACPY(&(*sn)->addr, src, af); LIST_INSERT_HEAD(&sh->nodes, *sn, entry); (*sn)->creation = time_uptime; (*sn)->ruletype = rule->action; (*sn)->states = 1; if ((*sn)->rule.ptr != NULL) counter_u64_add((*sn)->rule.ptr->src_nodes, 1); PF_HASHROW_UNLOCK(sh); counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); } else { if (rule->max_src_states && (*sn)->states >= rule->max_src_states) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], 1); return (-1); } } return (0); } void pf_unlink_src_node(struct pf_ksrc_node *src) { PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]); LIST_REMOVE(src, entry); if (src->rule.ptr) counter_u64_add(src->rule.ptr->src_nodes, -1); } u_int 
pf_free_src_nodes(struct pf_ksrc_node_list *head) { struct pf_ksrc_node *sn, *tmp; u_int count = 0; LIST_FOREACH_SAFE(sn, head, entry, tmp) { pf_free_src_node(sn); count++; } counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count); return (count); } void pf_mtag_initialize() { pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL, UMA_ALIGN_PTR, 0); } /* Per-vnet data storage structures initialization. */ void pf_initialize() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; u_int i; if (pf_hashsize == 0 || !powerof2(pf_hashsize)) pf_hashsize = PF_HASHSIZ; if (pf_srchashsize == 0 || !powerof2(pf_srchashsize)) pf_srchashsize = PF_SRCHASHSIZ; V_pf_hashseed = arc4random(); /* States and state keys storage. */ V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z; uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT); uma_zone_set_warning(V_pf_state_z, "PF states limit reached"); V_pf_state_key_z = uma_zcreate("pf state keys", sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash), M_PFHASH, M_NOWAIT | M_ZERO); V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash), M_PFHASH, M_NOWAIT | M_ZERO); if (V_pf_keyhash == NULL || V_pf_idhash == NULL) { printf("pf: Unable to allocate memory for " "state_hashsize %lu.\n", pf_hashsize); free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); pf_hashsize = PF_HASHSIZ; V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO); V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO); } pf_hashmask = pf_hashsize - 1; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; i++, kh++, ih++) { mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF); } /* Source nodes. */ V_pf_sources_z = uma_zcreate("pf source nodes", sizeof(struct pf_ksrc_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z; uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT); uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached"); V_pf_srchash = mallocarray(pf_srchashsize, sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO); if (V_pf_srchash == NULL) { printf("pf: Unable to allocate memory for " "source_hashsize %lu.\n", pf_srchashsize); pf_srchashsize = PF_SRCHASHSIZ; V_pf_srchash = mallocarray(pf_srchashsize, sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO); } pf_srchashmask = pf_srchashsize - 1; for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); /* ALTQ */ TAILQ_INIT(&V_pf_altqs[0]); TAILQ_INIT(&V_pf_altqs[1]); TAILQ_INIT(&V_pf_altqs[2]); TAILQ_INIT(&V_pf_altqs[3]); TAILQ_INIT(&V_pf_pabuf); V_pf_altqs_active = &V_pf_altqs[0]; V_pf_altq_ifs_active = &V_pf_altqs[1]; V_pf_altqs_inactive = &V_pf_altqs[2]; V_pf_altq_ifs_inactive = &V_pf_altqs[3]; /* Send & overload+flush queues. */ STAILQ_INIT(&V_pf_sendqueue); SLIST_INIT(&V_pf_overloadqueue); TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet); /* Unlinked, but may be referenced rules. 
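 * Rules land on this queue when they are unlinked from their ruleset; * pf_purge_unlinked_rules() frees them once its PFRULE_REFS mark-and-sweep * shows that nothing references them anymore.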
*/ TAILQ_INIT(&V_pf_unlinked_rules); } void pf_mtag_cleanup() { uma_zdestroy(pf_mtag_z); } void pf_cleanup() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; struct pf_send_entry *pfse, *next; u_int i; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; i++, kh++, ih++) { KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", __func__)); KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", __func__)); mtx_destroy(&kh->lock); mtx_destroy(&ih->lock); } free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { KASSERT(LIST_EMPTY(&sh->nodes), ("%s: source node hash not empty", __func__)); mtx_destroy(&sh->lock); } free(V_pf_srchash, M_PFHASH); STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { m_freem(pfse->pfse_m); free(pfse, M_PFTEMP); } uma_zdestroy(V_pf_sources_z); uma_zdestroy(V_pf_state_z); uma_zdestroy(V_pf_state_key_z); } static int pf_mtag_uminit(void *mem, int size, int how) { struct m_tag *t; t = (struct m_tag *)mem; t->m_tag_cookie = MTAG_ABI_COMPAT; t->m_tag_id = PACKET_TAG_PF; t->m_tag_len = sizeof(struct pf_mtag); t->m_tag_free = pf_mtag_free; return (0); } static void pf_mtag_free(struct m_tag *t) { uma_zfree(pf_mtag_z, t); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) return ((struct pf_mtag *)(mtag + 1)); mtag = uma_zalloc(pf_mtag_z, M_NOWAIT); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof(struct pf_mtag)); m_tag_prepend(m, mtag); return ((struct pf_mtag *)(mtag + 1)); } static int pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_keyhash *khs, *khw, *kh; struct pf_state_key *sk, *cur; struct pf_state *si, *olds = NULL; int idx; KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); /* * We need to lock hash slots of both keys. To avoid deadlock * we always lock the slot with lower address first. Unlock order * isn't important. * * We also need to lock ID hash slot before dropping key * locks. On success we return with ID hash slot locked. */ if (skw == sks) { khs = khw = &V_pf_keyhash[pf_hashkey(skw)]; PF_HASHROW_LOCK(khs); } else { khs = &V_pf_keyhash[pf_hashkey(sks)]; khw = &V_pf_keyhash[pf_hashkey(skw)]; if (khs == khw) { PF_HASHROW_LOCK(khs); } else if (khs < khw) { PF_HASHROW_LOCK(khs); PF_HASHROW_LOCK(khw); } else { PF_HASHROW_LOCK(khw); PF_HASHROW_LOCK(khs); } } #define KEYS_UNLOCK() do { \ if (khs != khw) { \ PF_HASHROW_UNLOCK(khs); \ PF_HASHROW_UNLOCK(khw); \ } else \ PF_HASHROW_UNLOCK(khs); \ } while (0) /* * First run: start with wire key. */ sk = skw; kh = khw; idx = PF_SK_WIRE; keyattach: LIST_FOREACH(cur, &kh->keys, entry) if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) break; if (cur != NULL) { /* Key exists. Check for same kif, if none, add to key. */ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; PF_HASHROW_LOCK(ih); if (si->kif == s->kif && si->direction == s->direction) { if (sk->proto == IPPROTO_TCP && si->src.state >= TCPS_FIN_WAIT_2 && si->dst.state >= TCPS_FIN_WAIT_2) { /* * New state matches an old >FIN_WAIT_2 * state. We can't drop key hash locks, * thus we can't unlink it properly. 
* * As a workaround we drop it into * TCPS_CLOSED state, schedule purge * ASAP and push it into the very end * of the slot TAILQ, so that it won't * conflict with our new state. */ si->src.state = si->dst.state = TCPS_CLOSED; si->timeout = PFTM_PURGE; olds = si; } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: %s key attach " "failed on %s: ", (idx == PF_SK_WIRE) ? "wire" : "stack", s->kif->pfik_name); pf_print_state_parts(s, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf(", existing: "); pf_print_state_parts(si, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf("\n"); } PF_HASHROW_UNLOCK(ih); KEYS_UNLOCK(); uma_zfree(V_pf_state_key_z, sk); if (idx == PF_SK_STACK) pf_detach_state(s); return (EEXIST); /* collision! */ } } PF_HASHROW_UNLOCK(ih); } uma_zfree(V_pf_state_key_z, sk); s->key[idx] = cur; } else { LIST_INSERT_HEAD(&kh->keys, sk, entry); s->key[idx] = sk; } stateattach: /* List is sorted, if-bound states before floating. */ if (s->kif == V_pfi_all) TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]); else TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]); if (olds) { TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]); TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds, key_list[idx]); olds = NULL; } /* * Attach done. Now decide whether (and how) we * should attach a second key. */ if (sks == skw) { s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; idx = PF_SK_STACK; sks = NULL; goto stateattach; } else if (sks != NULL) { /* * Continue attaching with stack key. */ sk = sks; kh = khs; idx = PF_SK_STACK; sks = NULL; goto keyattach; } PF_STATE_LOCK(s); KEYS_UNLOCK(); KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL, ("%s failure", __func__)); return (0); #undef KEYS_UNLOCK } static void pf_detach_state(struct pf_state *s) { struct pf_state_key *sks = s->key[PF_SK_STACK]; struct pf_keyhash *kh; if (sks != NULL) { kh = &V_pf_keyhash[pf_hashkey(sks)]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_STACK] != NULL) pf_state_key_detach(s, PF_SK_STACK); /* * If both point to same key, then we are done.
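 * (A state that was not rewritten by NAT has skw == sks, so both slots * point at the same pf_state_key.)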
*/ if (sks == s->key[PF_SK_WIRE]) { pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); return; } PF_HASHROW_UNLOCK(kh); } if (s->key[PF_SK_WIRE] != NULL) { kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_WIRE] != NULL) pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); } } static void pf_state_key_detach(struct pf_state *s, int idx) { struct pf_state_key *sk = s->key[idx]; #ifdef INVARIANTS struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; PF_HASHROW_ASSERT(kh); #endif TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); s->key[idx] = NULL; if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { LIST_REMOVE(sk, entry); uma_zfree(V_pf_state_key_z, sk); } } static int pf_state_key_ctor(void *mem, int size, void *arg, int flags) { struct pf_state_key *sk = mem; bzero(sk, sizeof(struct pf_state_key_cmp)); TAILQ_INIT(&sk->states[PF_SK_WIRE]); TAILQ_INIT(&sk->states[PF_SK_STACK]); return (0); } struct pf_state_key * pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); sk->port[pd->sidx] = sport; sk->port[pd->didx] = dport; sk->proto = pd->proto; sk->af = pd->af; return (sk); } struct pf_state_key * pf_state_key_clone(struct pf_state_key *orig) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); return (sk); } int pf_state_insert(struct pfi_kkif *kif, struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_idhash *ih; struct pf_state *cur; int error; KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), ("%s: sks not pristine", __func__)); KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), ("%s: skw not pristine", __func__)); KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); s->kif = kif; if (s->id == 0 && s->creatorid == 0) { /* XXX: should be atomic, but probability of collision low */ if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID) V_pf_stateid[curcpu] = 1; s->id |= (uint64_t )curcpu << PFID_CPUSHIFT; s->id = htobe64(s->id); s->creatorid = V_pf_status.hostid; } /* Returns with ID locked on success. */ if ((error = pf_state_key_attach(skw, sks, s)) != 0) return (error); ih = &V_pf_idhash[PF_IDHASH(s)]; PF_HASHROW_ASSERT(ih); LIST_FOREACH(cur, &ih->states, entry) if (cur->id == s->id && cur->creatorid == s->creatorid) break; if (cur != NULL) { PF_HASHROW_UNLOCK(ih); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state ID collision: " "id: %016llx creatorid: %08x\n", (unsigned long long)be64toh(s->id), ntohl(s->creatorid)); } pf_detach_state(s); return (EEXIST); } LIST_INSERT_HEAD(&ih->states, s, entry); /* One for keys, one for ID hash. */ refcount_init(&s->refs, 2); counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1); if (V_pfsync_insert_state_ptr != NULL) V_pfsync_insert_state_ptr(s); /* Returns locked. */ return (0); } /* * Find state by ID: returns with locked row on success. 
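 * The row is picked by be64toh(id) % (pf_hashmask + 1), so only the state * ID itself (not the creator ID) decides placement in V_pf_idhash.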
*/ struct pf_state * pf_find_state_byid(uint64_t id, uint32_t creatorid) { struct pf_idhash *ih; struct pf_state *s; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) if (s->id == id && s->creatorid == creatorid) break; if (s == NULL) PF_HASHROW_UNLOCK(ih); return (s); } /* * Find state by key. * Returns with ID hash slot locked on success. */ static struct pf_state * pf_find_state(struct pfi_kkif *kif, struct pf_state_key_cmp *key, u_int dir) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s; int idx; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); /* List is sorted, if-bound states before floating ones. */ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) if (s->kif == V_pfi_all || s->kif == kif) { PF_STATE_LOCK(s); PF_HASHROW_UNLOCK(kh); if (s->timeout >= PFTM_MAX) { /* * State is either being processed by * pf_unlink_state() in another thread, or * is scheduled for immediate expiry. */ PF_STATE_UNLOCK(s); return (NULL); } return (s); } PF_HASHROW_UNLOCK(kh); return (NULL); } struct pf_state * pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s, *ret = NULL; int idx, inout = 0; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } switch (dir) { case PF_IN: idx = PF_SK_WIRE; break; case PF_OUT: idx = PF_SK_STACK; break; case PF_INOUT: idx = PF_SK_WIRE; inout = 1; break; default: panic("%s: dir %u", __func__, dir); } second_run: TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { if (more == NULL) { PF_HASHROW_UNLOCK(kh); return (s); } if (ret) (*more)++; else ret = s; } if (inout == 1) { inout = 0; idx = PF_SK_STACK; goto second_run; } PF_HASHROW_UNLOCK(kh); return (ret); } /* END state table stuff */ static void pf_send(struct pf_send_entry *pfse) { PF_SENDQ_LOCK(); STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); PF_SENDQ_UNLOCK(); swi_sched(V_pf_swi_cookie, 0); } void pf_intr(void *v) { struct epoch_tracker et; struct pf_send_head queue; struct pf_send_entry *pfse, *next; CURVNET_SET((struct vnet *)v); PF_SENDQ_LOCK(); queue = V_pf_sendqueue; STAILQ_INIT(&V_pf_sendqueue); PF_SENDQ_UNLOCK(); NET_EPOCH_ENTER(et); STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { switch (pfse->pfse_type) { #ifdef INET case PFSE_IP: ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); break; case PFSE_ICMP: icmp_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, 0, pfse->icmpopts.mtu); break; #endif /* INET */ #ifdef INET6 case PFSE_IP6: ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, NULL); break; case PFSE_ICMP6: icmp6_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, pfse->icmpopts.mtu); break; #endif /* INET6 */ default: panic("%s: unknown type", __func__); } free(pfse, M_PFTEMP); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } void pf_purge_thread(void *unused __unused) { VNET_ITERATOR_DECL(vnet_iter);
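	/* * Each wakeup below scans pf_hashmask / (PFTM_INTERVAL * 10) ID hash rows. * With the hz / 10 sleep that is ten wakeups per second, i.e. roughly one * full pass over the state table every PFTM_INTERVAL seconds. Example with * made-up numbers: pf_hashmask = 32767 and a 10 second interval give * 32767 / 100 ~= 327 rows per wakeup and 100 wakeups per interval. */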
sx_xlock(&pf_end_lock); while (pf_end_threads == 0) { sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", hz / 10); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* Wait until V_pf_default_rule is initialized. */ if (V_pf_vnet_active == 0) { CURVNET_RESTORE(); continue; } /* * Process 1/interval fraction of the state * table every run. */ V_pf_purge_idx = pf_purge_expired_states(V_pf_purge_idx, pf_hashmask / (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); /* * Purge other expired types every * PFTM_INTERVAL seconds. */ if (V_pf_purge_idx == 0) { /* * Order is important: * - states and src nodes reference rules * - states and rules reference kifs */ pf_purge_expired_fragments(); pf_purge_expired_src_nodes(); pf_purge_unlinked_rules(); pfi_kkif_purge(); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } pf_end_threads++; sx_xunlock(&pf_end_lock); kproc_exit(0); } void pf_unload_vnet_purge(void) { /* * To clean up all kifs and rules we need * two runs: the first one clears reference flags, * so that pf_purge_expired_states() doesn't * raise them again, and the second run frees. */ pf_purge_unlinked_rules(); pfi_kkif_purge(); /* * Now purge everything. */ pf_purge_expired_states(0, pf_hashmask); pf_purge_fragments(UINT_MAX); pf_purge_expired_src_nodes(); /* * Now all kifs & rules should be unreferenced, * thus should be successfully freed. */ pf_purge_unlinked_rules(); pfi_kkif_purge(); } u_int32_t pf_state_expires(const struct pf_state *state) { u_int32_t timeout; u_int32_t start; u_int32_t end; u_int32_t states; /* handle all PFTM_* > PFTM_MAX here */ if (state->timeout == PFTM_PURGE) return (time_uptime); KASSERT(state->timeout != PFTM_UNLINKED, ("pf_state_expires: timeout == PFTM_UNLINKED")); KASSERT((state->timeout < PFTM_MAX), ("pf_state_expires: timeout > PFTM_MAX")); timeout = state->rule.ptr->timeout[state->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; if (start && state->rule.ptr != &V_pf_default_rule) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = counter_u64_fetch(state->rule.ptr->states_cur); } else { start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; states = V_pf_status.states; } if (end && states > start && start < end) { if (states < end) { timeout = (u_int64_t)timeout * (end - states) / (end - start); return (state->expire + timeout); } else return (time_uptime); } return (state->expire + timeout); } void pf_purge_expired_src_nodes() { struct pf_ksrc_node_list freelist; struct pf_srchash *sh; struct pf_ksrc_node *cur, *next; int i; LIST_INIT(&freelist); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) if (cur->states == 0 && cur->expire <= time_uptime) { pf_unlink_src_node(cur); LIST_INSERT_HEAD(&freelist, cur, entry); } else if (cur->rule.ptr != NULL) cur->rule.ptr->rule_flag |= PFRULE_REFS; PF_HASHROW_UNLOCK(sh); } pf_free_src_nodes(&freelist); V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z); } static void pf_src_tree_remove_state(struct pf_state *s) { struct pf_ksrc_node *sn; struct pf_srchash *sh; uint32_t timeout; timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
s->rule.ptr->timeout[PFTM_SRC_NODE] : V_pf_default_rule.timeout[PFTM_SRC_NODE]; if (s->src_node != NULL) { sn = s->src_node; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (s->src.tcp_est) --sn->conn; if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_HASHROW_UNLOCK(sh); } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { sn = s->nat_src_node; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_HASHROW_UNLOCK(sh); } s->src_node = s->nat_src_node = NULL; } /* * Unlink and potentially free a state. Function may be * called with ID hash row locked, but always returns * unlocked, since it needs to go through key hash locking. */ int pf_unlink_state(struct pf_state *s, u_int flags) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; if ((flags & PF_ENTER_LOCKED) == 0) PF_HASHROW_LOCK(ih); else PF_HASHROW_ASSERT(ih); if (s->timeout == PFTM_UNLINKED) { /* * State is being processed * by pf_unlink_state() in * another thread. */ PF_HASHROW_UNLOCK(ih); return (0); /* XXXGL: undefined actually */ } if (s->src.state == PF_TCPS_PROXY_DST) { /* XXX wire key the right one? */ pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af, &s->key[PF_SK_WIRE]->addr[1], &s->key[PF_SK_WIRE]->addr[0], s->key[PF_SK_WIRE]->port[1], s->key[PF_SK_WIRE]->port[0], s->src.seqhi, s->src.seqlo + 1, TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL); } LIST_REMOVE(s, entry); pf_src_tree_remove_state(s); if (V_pfsync_delete_state_ptr != NULL) V_pfsync_delete_state_ptr(s); STATE_DEC_COUNTERS(s); s->timeout = PFTM_UNLINKED; PF_HASHROW_UNLOCK(ih); pf_detach_state(s); /* pf_state_insert() initialises refs to 2, so we can never release the * last reference here, only in pf_release_state(). */ (void)refcount_release(&s->refs); return (pf_release_state(s)); } void pf_free_state(struct pf_state *cur) { KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, cur->timeout)); for (int i = 0; i < 2; i++) { if (cur->bytes[i] != NULL) counter_u64_free(cur->bytes[i]); if (cur->packets[i] != NULL) counter_u64_free(cur->packets[i]); } pf_normalize_tcp_cleanup(cur); uma_zfree(V_pf_state_z, cur); counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1); } /* * Called only from pf_purge_thread(), thus serialized. */ static u_int pf_purge_expired_states(u_int i, int maxcheck) { struct pf_idhash *ih; struct pf_state *s; V_pf_status.states = uma_zone_get_cur(V_pf_state_z); /* * Go through hash and unlink states that expire now. */ while (maxcheck > 0) { ih = &V_pf_idhash[i]; /* only take the lock if we expect to do work */ if (!LIST_EMPTY(&ih->states)) { relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (pf_state_expires(s) <= time_uptime) { V_pf_status.states -= pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } s->rule.ptr->rule_flag |= PFRULE_REFS; if (s->nat_rule.ptr != NULL) s->nat_rule.ptr->rule_flag |= PFRULE_REFS; if (s->anchor.ptr != NULL) s->anchor.ptr->rule_flag |= PFRULE_REFS; s->kif->pfik_flags |= PFI_IFLAG_REFS; if (s->rt_kif) s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } PF_HASHROW_UNLOCK(ih); } /* Return when we hit end of hash.
*/ if (++i > pf_hashmask) { V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (0); } maxcheck--; } V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (i); } static void pf_purge_unlinked_rules() { struct pf_krulequeue tmpq; struct pf_krule *r, *r1; /* * If we have overloading task pending, then we'd * better skip purging this time. There is a tiny * probability that overloading task references * an already unlinked rule. */ PF_OVERLOADQ_LOCK(); if (!SLIST_EMPTY(&V_pf_overloadqueue)) { PF_OVERLOADQ_UNLOCK(); return; } PF_OVERLOADQ_UNLOCK(); /* * Do naive mark-and-sweep garbage collecting of old rules. * Reference flag is raised by pf_purge_expired_states() * and pf_purge_expired_src_nodes(). * * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, * use a temporary queue. */ TAILQ_INIT(&tmpq); PF_UNLNKDRULES_LOCK(); TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { if (!(r->rule_flag & PFRULE_REFS)) { TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); TAILQ_INSERT_TAIL(&tmpq, r, entries); } else r->rule_flag &= ~PFRULE_REFS; } PF_UNLNKDRULES_UNLOCK(); if (!TAILQ_EMPTY(&tmpq)) { PF_RULES_WLOCK(); TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { TAILQ_REMOVE(&tmpq, r, entries); pf_free_rule(r); } PF_RULES_WUNLOCK(); } } void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { u_int32_t a = ntohl(addr->addr32[0]); printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, (a>>8)&255, a&255); if (p) { p = ntohs(p); printf(":%u", p); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { u_int16_t b; u_int8_t i, curstart, curend, maxstart, maxend; curstart = curend = maxstart = maxend = 255; for (i = 0; i < 8; i++) { if (!addr->addr16[i]) { if (curstart == 255) curstart = i; curend = i; } else { if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } curstart = curend = 255; } } if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } for (i = 0; i < 8; i++) { if (i >= maxstart && i <= maxend) { if (i == 0) printf(":"); if (i == maxend) printf(":"); } else { b = ntohs(addr->addr16[i]); printf("%x", b); if (i < 7) printf(":"); } } if (p) { p = ntohs(p); printf("[%u]", p); } break; } #endif /* INET6 */ } } void pf_print_state(struct pf_state *s) { pf_print_state_parts(s, NULL, NULL); } static void pf_print_state_parts(struct pf_state *s, struct pf_state_key *skwp, struct pf_state_key *sksp) { struct pf_state_key *skw, *sks; u_int8_t proto, dir; /* Do our best to fill these, but they're skipped if NULL */ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); proto = skw ? skw->proto : (sks ? sks->proto : 0); dir = s ? 
s->direction : 0; switch (proto) { case IPPROTO_IPV4: printf("IPv4"); break; case IPPROTO_IPV6: printf("IPv6"); break; case IPPROTO_TCP: printf("TCP"); break; case IPPROTO_UDP: printf("UDP"); break; case IPPROTO_ICMP: printf("ICMP"); break; case IPPROTO_ICMPV6: printf("ICMPv6"); break; default: printf("%u", proto); break; } switch (dir) { case PF_IN: printf(" in"); break; case PF_OUT: printf(" out"); break; } if (skw) { printf(" wire: "); pf_print_host(&skw->addr[0], skw->port[0], skw->af); printf(" "); pf_print_host(&skw->addr[1], skw->port[1], skw->af); } if (sks) { printf(" stack: "); if (sks != skw) { pf_print_host(&sks->addr[0], sks->port[0], sks->af); printf(" "); pf_print_host(&sks->addr[1], sks->port[1], sks->af); } else printf("-"); } if (s) { if (proto == IPPROTO_TCP) { printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); printf("]"); printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); printf("]"); } printf(" %u:%u", s->src.state, s->dst.state); } } void pf_print_flags(u_int8_t f) { if (f) printf(" "); if (f & TH_FIN) printf("F"); if (f & TH_SYN) printf("S"); if (f & TH_RST) printf("R"); if (f & TH_PUSH) printf("P"); if (f & TH_ACK) printf("A"); if (f & TH_URG) printf("U"); if (f & TH_ECE) printf("E"); if (f & TH_CWR) printf("W"); } #define PF_SET_SKIP_STEPS(i) \ do { \ while (head[i] != cur) { \ head[i]->skip[i].ptr = cur; \ head[i] = TAILQ_NEXT(head[i], entries); \ } \ } while (0) void pf_calc_skip_steps(struct pf_krulequeue *rules) { struct pf_krule *cur, *prev, *head[PF_SKIP_COUNT]; int i; cur = TAILQ_FIRST(rules); prev = cur; for (i = 0; i < PF_SKIP_COUNT; ++i) head[i] = cur; while (cur != NULL) { if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) PF_SET_SKIP_STEPS(PF_SKIP_IFP); if (cur->direction != prev->direction) PF_SET_SKIP_STEPS(PF_SKIP_DIR); if (cur->af != prev->af) PF_SET_SKIP_STEPS(PF_SKIP_AF); if (cur->proto != prev->proto) PF_SET_SKIP_STEPS(PF_SKIP_PROTO); if (cur->src.neg != prev->src.neg || pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); if (cur->src.port[0] != prev->src.port[0] || cur->src.port[1] != prev->src.port[1] || cur->src.port_op != prev->src.port_op) PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); if (cur->dst.port[0] != prev->dst.port[0] || cur->dst.port[1] != prev->dst.port[1] || cur->dst.port_op != prev->dst.port_op) PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); prev = cur; cur = TAILQ_NEXT(cur, entries); } for (i = 0; i < PF_SKIP_COUNT; ++i) PF_SET_SKIP_STEPS(i); } static int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { if (aw1->type != aw2->type) return (1); switch (aw1->type) { case PF_ADDR_ADDRMASK: case PF_ADDR_RANGE: if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6)) return (1); if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6)) return (1); return (0); case PF_ADDR_DYNIFTL: return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (aw1->p.tbl != aw2->p.tbl); default: printf("invalid address type: %d\n", aw1->type); return (1); } } /** * Checksum updates are a little complicated because the checksum 
in the TCP/UDP * header isn't always a full checksum. In some cases (i.e. output) it's a * pseudo-header checksum, which is a partial checksum over src/dst IP * addresses, protocol number and length. * * That means we have the following cases: * * Input or forwarding: we don't have TSO, the checksum fields are full * checksums, we need to update the checksum whenever we change anything. * * Output (i.e. the checksum is a pseudo-header checksum): * x The field being updated is src/dst address or affects the length of * the packet. We need to update the pseudo-header checksum (note that this * checksum is not ones' complement). * x Some other field is being modified (e.g. src/dst port numbers): We * don't have to update anything. **/ u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { u_int32_t x; x = cksum + old - new; x = (x + (x >> 16)) & 0xffff; /* optimise: eliminate a branch when not udp */ if (udp && cksum == 0x0000) return cksum; if (udp && x == 0x0000) x = 0xffff; return (u_int16_t)(x); } static void pf_patch_8(struct mbuf *m, u_int16_t *cksum, u_int8_t *f, u_int8_t v, bool hi, u_int8_t udp) { u_int16_t old = htons(hi ? (*f << 8) : *f); u_int16_t new = htons(hi ? ( v << 8) : v); if (*f == v) return; *f = v; if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) return; *cksum = pf_cksum_fixup(*cksum, old, new, udp); } void pf_patch_16_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int16_t v, bool hi, u_int8_t udp) { u_int8_t *fb = (u_int8_t *)f; u_int8_t *vb = (u_int8_t *)&v; pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); } void pf_patch_32_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int32_t v, bool hi, u_int8_t udp) { u_int8_t *fb = (u_int8_t *)f; u_int8_t *vb = (u_int8_t *)&v; pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); pf_patch_8(m, cksum, fb++, *vb++, hi, udp); pf_patch_8(m, cksum, fb++, *vb++, !hi, udp); } u_int16_t pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) return (cksum); return (pf_cksum_fixup(cksum, old, new, udp)); } static void pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) { struct pf_addr ao; u_int16_t po = *p; PF_ACPY(&ao, a, af); PF_ACPY(a, an, af); if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) *pc = ~*pc; *p = pn; switch (af) { #ifdef INET case AF_INET: *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; *pc = pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET6 */ } if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { *pc = ~*pc; if (! 
*pc) *pc = 0xffff; } } /* Changes a u_int32_t. Uses a void * so there are no align restrictions */ void pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), ao % 65536, an % 65536, u); } void pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_proto_cksum_fixup(m, pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp), ao % 65536, an % 65536, udp); } #ifdef INET6 static void pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; PF_ACPY(&ao, a, AF_INET6); PF_ACPY(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*c, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); } #endif /* INET6 */ static void pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) { struct pf_addr oia, ooa; PF_ACPY(&oia, ia, af); if (oa) PF_ACPY(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { u_int16_t oip = *ip; u_int32_t opc; if (pc != NULL) opc = *pc; *ip = np; if (pc != NULL) *pc = pf_cksum_fixup(*pc, oip, *ip, u); *ic = pf_cksum_fixup(*ic, oip, *ip, 0); if (pc != NULL) *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ PF_ACPY(ia, na, af); switch (af) { #ifdef INET case AF_INET: { u_int32_t oh2c = *h2c; *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); break; } #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], u), oia.addr16[1], ia->addr16[1], u), oia.addr16[2], ia->addr16[2], u), oia.addr16[3], ia->addr16[3], u), oia.addr16[4], ia->addr16[4], u), oia.addr16[5], ia->addr16[5], u), oia.addr16[6], ia->addr16[6], u), oia.addr16[7], ia->addr16[7], u); break; #endif /* INET6 */ } /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. 
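 * (IPv6 carries no header checksum of its own, which is why the AF_INET6 * branch below only touches the ICMPv6 checksum.)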
*/ if (oa) { PF_ACPY(oa, na, af); switch (af) { #ifdef INET case AF_INET: *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, ooa.addr16[0], oa->addr16[0], 0), ooa.addr16[1], oa->addr16[1], 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, ooa.addr16[0], oa->addr16[0], u), ooa.addr16[1], oa->addr16[1], u), ooa.addr16[2], oa->addr16[2], u), ooa.addr16[3], oa->addr16[3], u), ooa.addr16[4], oa->addr16[4], u), ooa.addr16[5], oa->addr16[5], u), ooa.addr16[6], oa->addr16[6], u), ooa.addr16[7], oa->addr16[7], u); break; #endif /* INET6 */ } } } /* * Need to modulate the sequence numbers in the TCP SACK option * (credits to Krzysztof Pfaff for report and patch) */ static int pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *dst) { int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; u_int8_t opts[TCP_MAXOLEN], *opt = opts; int copyback = 0, i, olen; struct sackblk sack; #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) if (hlen < TCPOLEN_SACKLEN || !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) return 0; while (hlen >= TCPOLEN_SACKLEN) { size_t startoff = opt - opts; olen = opt[1]; switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_SACK: if (olen > hlen) olen = hlen; if (olen >= TCPOLEN_SACKLEN) { for (i = 2; i + TCPOLEN_SACK <= olen; i += TCPOLEN_SACK) { memcpy(&sack, &opt[i], sizeof(sack)); pf_patch_32_unaligned(m, &th->th_sum, &sack.start, htonl(ntohl(sack.start) - dst->seqdiff), PF_ALGNMNT(startoff), 0); pf_patch_32_unaligned(m, &th->th_sum, &sack.end, htonl(ntohl(sack.end) - dst->seqdiff), PF_ALGNMNT(startoff), 0); memcpy(&opt[i], &sack, sizeof(sack)); } copyback = 1; } /* FALLTHROUGH */ default: if (olen < 2) olen = 2; hlen -= olen; opt += olen; } } if (copyback) m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); return (copyback); } static void pf_send_tcp(struct mbuf *replyto, const struct pf_krule *r, sa_family_t af, const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, u_int16_t rtag, struct ifnet *ifp) { struct pf_send_entry *pfse; struct mbuf *m; int len, tlen; #ifdef INET struct ip *h = NULL; #endif /* INET */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif /* INET6 */ struct tcphdr *th; char *opt; struct pf_mtag *pf_mtag; len = 0; th = NULL; /* maximum segment size tcp option */ tlen = sizeof(struct tcphdr); if (mss) tlen += 4; switch (af) { #ifdef INET case AF_INET: len = sizeof(struct ip) + tlen; break; #endif /* INET */ #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr) + tlen; break; #endif /* INET6 */ default: panic("%s: unsupported af %d", __func__, af); } /* Allocate outgoing queue entry, mbuf and mbuf tag. 
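 * Everything on this path is allocated M_NOWAIT; if an allocation fails the * reply segment is simply not sent rather than sleeping in the packet path.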
*/ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { free(pfse, M_PFTEMP); return; } #ifdef MAC mac_netinet_firewall_send(m); #endif if ((pf_mtag = pf_get_mtag(m)) == NULL) { free(pfse, M_PFTEMP); m_freem(m); return; } if (tag) m->m_flags |= M_SKIP_FIREWALL; pf_mtag->tag = rtag; if (r != NULL && r->rtableid >= 0) M_SETFIB(m, r->rtableid); #ifdef ALTQ if (r != NULL && r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m, struct ip *); } #endif /* ALTQ */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (af) { #ifdef INET case AF_INET: h = mtod(m, struct ip *); /* IP header fields included in the TCP checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(tlen); h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); /* IP header fields included in the TCP checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(tlen); memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); break; #endif /* INET6 */ } /* TCP header */ th->th_sport = sport; th->th_dport = dport; th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_off = tlen >> 2; th->th_flags = flags; th->th_win = htons(win); if (mss) { opt = (char *)(th + 1); opt[0] = TCPOPT_MAXSEG; opt[1] = 4; HTONS(mss); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); } switch (af) { #ifdef INET case AF_INET: /* TCP checksum */ th->th_sum = in_cksum(m, len); /* Finish the IP header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); h->ip_len = htons(len); h->ip_ttl = ttl ? 
ttl : V_ip_defttl; h->ip_sum = 0; pfse->pfse_type = PFSE_IP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* TCP checksum */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; pfse->pfse_type = PFSE_IP6; break; #endif /* INET6 */ } pfse->pfse_m = m; pf_send(pfse); } static void pf_return(struct pf_krule *r, struct pf_krule *nr, struct pf_pdesc *pd, struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th, struct pfi_kkif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, u_short *reason) { struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; /* undo NAT changes, if they have taken place */ if (nr != NULL) { PF_ACPY(saddr, &sk->addr[pd->sidx], af); PF_ACPY(daddr, &sk->addr[pd->didx], af); if (pd->sport) *pd->sport = sk->port[pd->sidx]; if (pd->dport) *pd->dport = sk->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } if (pd->proto == IPPROTO_TCP && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURN)) && !(th->th_flags & TH_RST)) { u_int32_t ack = ntohl(th->th_seq) + pd->p_len; int len = 0; #ifdef INET struct ip *h4; #endif #ifdef INET6 struct ip6_hdr *h6; #endif switch (af) { #ifdef INET case AF_INET: h4 = mtod(m, struct ip *); len = ntohs(h4->ip_len) - off; break; #endif #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); break; #endif } if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) REASON_SET(reason, PFRES_PROTCKSUM); else { if (th->th_flags & TH_SYN) ack++; if (th->th_flags & TH_FIN) ack++; pf_send_tcp(m, r, af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl, 1, 0, kif->pfik_ifp); } } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } static int pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio) { struct m_tag *mtag; KASSERT(prio <= PF_PRIO_MAX, ("%s with invalid pcp", __func__)); mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) return (ENOMEM); m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = prio; return (0); } static int pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m) { struct m_tag *mtag; u_int8_t mpcp; mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) return (0); if (prio == PF_PRIO_ZERO) prio = 0; mpcp = *(uint8_t *)(mtag + 1); return (mpcp == prio); } static void pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, struct pf_krule *r) { struct pf_send_entry *pfse; struct mbuf *m0; struct pf_mtag *pf_mtag; /* Allocate outgoing queue entry, mbuf and mbuf tag. 
*/ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { free(pfse, M_PFTEMP); return; } if ((pf_mtag = pf_get_mtag(m0)) == NULL) { free(pfse, M_PFTEMP); return; } /* XXX: revisit */ m0->m_flags |= M_SKIP_FIREWALL; if (r->rtableid >= 0) M_SETFIB(m0, r->rtableid); #ifdef ALTQ if (r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m0, struct ip *); } #endif /* ALTQ */ switch (af) { #ifdef INET case AF_INET: pfse->pfse_type = PFSE_ICMP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pfse->pfse_type = PFSE_ICMP6; break; #endif /* INET6 */ } pfse->pfse_m = m0; pfse->icmpopts.type = type; pfse->icmpopts.code = code; pf_send(pfse); } /* * Return 1 if the addresses a and b match (with mask m), otherwise return 0. * If n is 0, they match if they are equal. If n is != 0, they match if they * are different. */ int pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, struct pf_addr *b, sa_family_t af) { int match = 0; switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) match++; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) && ((a->addr32[1] & m->addr32[1]) == (b->addr32[1] & m->addr32[1])) && ((a->addr32[2] & m->addr32[2]) == (b->addr32[2] & m->addr32[2])) && ((a->addr32[3] & m->addr32[3]) == (b->addr32[3] & m->addr32[3]))) match++; break; #endif /* INET6 */ } if (match) { if (n) return (0); else return (1); } else { if (n) return (1); else return (0); } } /* * Return 1 if b <= a <= e, otherwise return 0. */ int pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) || (ntohl(a->addr32[0]) > ntohl(e->addr32[0]))) return (0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: { int i; /* check a >= b */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) > ntohl(b->addr32[i])) break; else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i])) return (0); /* check a <= e */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) < ntohl(e->addr32[i])) break; else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i])) return (0); break; } #endif /* INET6 */ } return (1); } static int pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) { switch (op) { case PF_OP_IRG: return ((p > a1) && (p < a2)); case PF_OP_XRG: return ((p < a1) || (p > a2)); case PF_OP_RRG: return ((p >= a1) && (p <= a2)); case PF_OP_EQ: return (p == a1); case PF_OP_NE: return (p != a1); case PF_OP_LT: return (p < a1); case PF_OP_LE: return (p <= a1); case PF_OP_GT: return (p > a1); case PF_OP_GE: return (p >= a1); } return (0); /* never reached */ } int pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) { NTOHS(a1); NTOHS(a2); NTOHS(p); return (pf_match(op, a1, a2, p)); } static int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } static int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } int pf_match_tag(struct mbuf *m, struct pf_krule *r, int *tag, int mtag) { if (*tag == -1) *tag = mtag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } int pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int 
tag) { KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) return (ENOMEM); pd->pf_mtag->tag = tag; return (0); } #define PF_ANCHOR_STACKSIZE 32 struct pf_kanchor_stackframe { struct pf_kruleset *rs; struct pf_krule *r; /* XXX: + match bit */ struct pf_kanchor *child; }; /* * XXX: We rely on malloc(9) returning pointer aligned addresses. */ #define PF_ANCHORSTACK_MATCH 0x00000001 #define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_RULE(f) (struct pf_krule *) \ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) #define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ } while (0) void pf_step_into_anchor(struct pf_kanchor_stackframe *stack, int *depth, struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a, int *match) { struct pf_kanchor_stackframe *f; PF_RULES_RASSERT(); if (match) *match = 0; if (*depth >= PF_ANCHOR_STACKSIZE) { printf("%s: anchor stack overflow on %s\n", __func__, (*r)->anchor->name); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { struct pf_kanchor_node *parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_kanchor_node, parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); } int pf_step_out_of_anchor(struct pf_kanchor_stackframe *stack, int *depth, struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a, int *match) { struct pf_kanchor_stackframe *f; struct pf_krule *fr; int quick = 0; PF_RULES_RASSERT(); do { if (*depth <= 0) break; f = stack + *depth - 1; fr = PF_ANCHOR_RULE(f); if (f->child != NULL) { struct pf_kanchor_node *parent; /* * This block traverses through * a wildcard anchor. */ parent = &fr->anchor->children; if (match != NULL && *match) { /* * If any of "*" matched, then * "foo/ *" matched, mark frame * appropriately. 
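 * (Added note: the match flag is kept in the low bit of the frame's rule
 * pointer, PF_ANCHORSTACK_MATCH, which is why the code above relies on
 * malloc(9) returning pointer-aligned addresses.)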
*/ PF_ANCHOR_SET_MATCH(f); *match = 0; } f->child = RB_NEXT(pf_kanchor_node, parent, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (PF_ANCHOR_MATCH(f) || (match != NULL && *match)) quick = fr->quick; *r = TAILQ_NEXT(fr, entries); } while (*r == NULL); return (quick); } #ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; } } void pf_addr_inc(struct pf_addr *addr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; if (addr->addr32[2] == 0xffffffff) { addr->addr32[2] = 0; if (addr->addr32[1] == 0xffffffff) { addr->addr32[1] = 0; addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); } else addr->addr32[1] = htonl(ntohl(addr->addr32[1]) + 1); } else addr->addr32[2] = htonl(ntohl(addr->addr32[2]) + 1); } else addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; } } #endif /* INET6 */ int pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; struct inpcbinfo *pi; struct inpcb *inp; pd->lookup.uid = UID_MAX; pd->lookup.gid = GID_MAX; switch (pd->proto) { case IPPROTO_TCP: if (pd->hdr.tcp == NULL) return (-1); sport = pd->hdr.tcp->th_sport; dport = pd->hdr.tcp->th_dport; pi = &V_tcbinfo; break; case IPPROTO_UDP: if (pd->hdr.udp == NULL) return (-1); sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; pi = &V_udbinfo; break; default: return (-1); } if (direction == PF_IN) { saddr = pd->src; daddr = pd->dst; } else { u_int16_t p; p = sport; sport = dport; dport = p; saddr = pd->dst; daddr = pd->src; } switch (pd->af) { #ifdef INET case AF_INET: inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET6 */ default: return (-1); } INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; INP_RUNLOCK(inp); return (1); } static u_int8_t pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int8_t wscale = 0; 
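	/*
	 * (Descriptive note added by the editor.) The loop below walks the
	 * TCP options: EOL and NOP are single bytes, everything else is
	 * TLV-encoded with its length in opt[1]. The shift count in opt[2]
	 * of a window scale option is clamped to TCP_MAX_WINSHIFT and tagged
	 * with PF_WSCALE_FLAG so callers can distinguish a negotiated scale
	 * of zero from no window scaling at all.
	 */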
hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= 3) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_WINDOW: wscale = opt[2]; if (wscale > TCP_MAX_WINSHIFT) wscale = TCP_MAX_WINSHIFT; wscale |= PF_WSCALE_FLAG; /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (wscale); } static u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int16_t mss = V_tcp_mssdflt; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= TCPOLEN_MAXSEG) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_MAXSEG: bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); NTOHS(mss); /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (mss); } static u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) { struct nhop_object *nh; #ifdef INET6 struct in6_addr dst6; uint32_t scopeid; #endif /* INET6 */ int hlen = 0; uint16_t mss = 0; NET_EPOCH_ASSERT(); switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); nh = fib4_lookup(rtableid, addr->v4, 0, 0, 0); if (nh != NULL) mss = nh->nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); in6_splitscope(&addr->v6, &dst6, &scopeid); nh = fib6_lookup(rtableid, &dst6, scopeid, 0, 0); if (nh != NULL) mss = nh->nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET6 */ } mss = max(V_tcp_mssdflt, mss); mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); } static u_int32_t pf_tcp_iss(struct pf_pdesc *pd) { MD5_CTX ctx; u_int32_t digest[4]; if (V_pf_tcp_secret_init == 0) { arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); MD5Init(&V_pf_tcp_secret_ctx); MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); V_pf_tcp_secret_init = 1; } ctx = V_pf_tcp_secret_ctx; MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short)); MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short)); if (pd->af == AF_INET6) { MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); } else { MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); } MD5Final((u_char *)digest, &ctx); V_pf_tcp_iss_off += 4096; #define ISN_RANDOM_INCREMENT (4096 - 1) return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + V_pf_tcp_iss_off); #undef ISN_RANDOM_INCREMENT } static int pf_test_rule(struct pf_krule **rm, struct pf_state **sm, int direction, struct pfi_kkif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_krule **am, struct pf_kruleset **rsm, struct inpcb *inp) { struct pf_krule *nr = NULL; struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; struct pf_krule *r, *a = NULL; struct pf_kruleset *ruleset = NULL; struct pf_ksrc_node *nsn = NULL; struct tcphdr *th = 
pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; u_short reason; int rewrite = 0, hdrlen = 0; int tag = -1, rtableid = -1; int asd = 0; int match = 0; int state_icmp = 0; u_int16_t sport = 0, dport = 0; u_int16_t bproto_sum = 0, bip_sum = 0; u_int8_t icmptype = 0, icmpcode = 0; struct pf_kanchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; pd->lookup.done = 1; } switch (pd->proto) { case IPPROTO_TCP: sport = th->th_sport; dport = th->th_dport; hdrlen = sizeof(*th); break; case IPPROTO_UDP: sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; hdrlen = sizeof(*pd->hdr.udp); break; #ifdef INET case IPPROTO_ICMP: if (pd->af != AF_INET) break; sport = dport = pd->hdr.icmp->icmp_id; hdrlen = sizeof(*pd->hdr.icmp); icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: if (af != AF_INET6) break; sport = dport = pd->hdr.icmp6->icmp6_id; hdrlen = sizeof(*pd->hdr.icmp6); icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ default: sport = dport = hdrlen = 0; break; } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk, &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { KASSERT(sk != NULL, ("%s: null sk", __func__)); KASSERT(nk != NULL, ("%s: null nk", __func__)); if (pd->ip_sum) bip_sum = *pd->ip_sum; switch (pd->proto) { case IPPROTO_TCP: bproto_sum = th->th_sum; pd->proto_sum = &th->th_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, af); pd->sport = &th->th_sport; sport = th->th_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, af); dport = th->th_dport; pd->dport = &th->th_dport; } rewrite++; break; case IPPROTO_UDP: bproto_sum = pd->hdr.udp->uh_sum; pd->proto_sum = &pd->hdr.udp->uh_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, af); sport = pd->hdr.udp->uh_sport; pd->sport = &pd->hdr.udp->uh_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, af); dport = pd->hdr.udp->uh_dport; pd->dport = &pd->hdr.udp->uh_dport; } rewrite++; break; #ifdef INET case IPPROTO_ICMP: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[1] != 
pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, sport, nk->port[1], 0); pd->hdr.icmp->icmp_id = nk->port[1]; pd->sport = &pd->hdr.icmp->icmp_id; } m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); rewrite++; break; #endif /* INET */ default: switch (af) { #ifdef INET case AF_INET: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(saddr, &nk->addr[pd->sidx], af); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(daddr, &nk->addr[pd->didx], af); break; #endif /* INET */ } break; } if (nr->natpass) r = NULL; pd->nat_rule = nr; } while (r != NULL) { counter_u64_add(r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ else if (r->type && r->type != icmptype + 1) r = TAILQ_NEXT(r, entries); /* icmp only. type always 0 in other cases */ else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); /* tcp/udp only. uid.op always 0 in other cases */ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); /* tcp/udp only. gid.op always 0 in other cases */ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? 
pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->log)) { if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd, 1); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason); } if (r->action == PF_DROP) goto cleanup; if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } if (rtableid >= 0) M_SETFIB(m, rtableid); if (!state_icmp && (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM))) { int action; action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, hdrlen); if (action != PF_PASS) { if (action == PF_DROP && (r->rule_flag & PFRULE_RETURN)) pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason); return (action); } } else { if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && direction == PF_OUT && V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m)) /* * We want the state created, but we dont * want to send this in case a partner * firewall has to know about it to allow * replies through it. 
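 * (Added note, based on pfsync's documented defer behaviour: PF_DEFER
 * tells the caller not to transmit the mbuf itself; pfsync releases or
 * drops the deferred packet once the peer has been told about the state.)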
*/ return (PF_DEFER); return (PF_PASS); cleanup: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); return (PF_DROP); } static int pf_create_state(struct pf_krule *r, struct pf_krule *nr, struct pf_krule *a, struct pf_pdesc *pd, struct pf_ksrc_node *nsn, struct pf_state_key *nk, struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen) { struct pf_state *s = NULL; struct pf_ksrc_node *sn = NULL; struct tcphdr *th = pd->hdr.tcp; u_int16_t mss = V_tcp_mssdflt; u_short reason; /* check maximums */ if (r->max_states && (counter_u64_fetch(r->states_cur) >= r->max_states)) { counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); REASON_SET(&reason, PFRES_MAXSTATES); goto csfailed; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto csfailed; } for (int i = 0; i < 2; i++) { s->bytes[i] = counter_u64_alloc(M_NOWAIT); s->packets[i] = counter_u64_alloc(M_NOWAIT); if (s->bytes[i] == NULL || s->packets[i] == NULL) { pf_free_state(s); REASON_SET(&reason, PFRES_MEMORY); goto csfailed; } } s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); if (r->allow_opts) s->state_flags |= PFSTATE_ALLOWOPTS; if (r->rule_flag & PFRULE_STATESLOPPY) s->state_flags |= PFSTATE_SLOPPY; s->log = r->log & PF_LOG_ALL; s->sync_state = PFSYNC_S_NONE; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; switch (pd->proto) { case IPPROTO_TCP: s->src.seqlo = ntohl(th->th_seq); s->src.seqhi = s->src.seqlo + pd->p_len + 1; if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_MODULATE) { /* Generate sequence number modulator */ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == 0) s->src.seqdiff = 1; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(s->src.seqlo + s->src.seqdiff), 0); *rewrite = 1; } else s->src.seqdiff = 0; if (th->th_flags & TH_SYN) { s->src.seqhi++; s->src.wscale = pf_get_wscale(m, off, th->th_off, pd->af); } s->src.max_win = MAX(ntohs(th->th_win), 1); if (s->src.wscale & PF_WSCALE_MASK) { /* Remove scale factor from initial window */ int win = s->src.max_win; win += 1 << (s->src.wscale & PF_WSCALE_MASK); s->src.max_win = (win - 1) >> (s->src.wscale & PF_WSCALE_MASK); } if (th->th_flags & TH_FIN) s->src.seqhi++; s->dst.seqhi = 1; s->dst.max_win = 1; s->src.state = TCPS_SYN_SENT; s->dst.state = TCPS_CLOSED; s->timeout = PFTM_TCP_FIRST_PACKET; break; case IPPROTO_UDP: s->src.state = PFUDPS_SINGLE; s->dst.state = PFUDPS_NO_TRAFFIC; s->timeout = PFTM_UDP_FIRST_PACKET; break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif s->timeout = PFTM_ICMP_FIRST_PACKET; break; default: s->src.state = PFOTHERS_SINGLE; s->dst.state = PFOTHERS_NO_TRAFFIC; s->timeout = PFTM_OTHER_FIRST_PACKET; } if (r->rt) { if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) { REASON_SET(&reason, PFRES_MAPFAILED); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); goto csfailed; } 
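		/* Remember the interface picked from the pool; pf_route() uses s->rt_kif later. */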
s->rt_kif = r->rpool.cur->kif; } s->creation = time_uptime; s->expire = time_uptime; if (sn != NULL) s->src_node = sn; if (nsn != NULL) { /* XXX We only modify one side for now. */ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); s->nat_src_node = nsn; } if (pd->proto == IPPROTO_TCP) { if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { REASON_SET(&reason, PFRES_MEMORY); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, &s->src, &s->dst, rewrite)) { /* This really shouldn't happen!!! */ DPFPRINTF(PF_DEBUG_URGENT, ("pf_normalize_tcp_stateful failed on first " "pkt\n")); pf_normalize_tcp_cleanup(s); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } } s->direction = pd->dir; /* * sk/nk could already been setup by pf_get_translation(). */ if (nr == NULL) { KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); if (sk == NULL) goto csfailed; nk = sk; } else KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); /* Swap sk/nk for PF_OUT. */ if (pf_state_insert(BOUND_IFACE(r, kif), (pd->dir == PF_IN) ? sk : nk, (pd->dir == PF_IN) ? nk : sk, s)) { if (pd->proto == IPPROTO_TCP) pf_normalize_tcp_cleanup(s); REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } else *sm = s; if (tag > 0) s->tag = tag; if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { s->src.state = PF_TCPS_PROXY_SRC; /* undo NAT changes, if they have taken place */ if (nr != NULL) { struct pf_state_key *skt = s->key[PF_SK_WIRE]; if (pd->dir == PF_OUT) skt = s->key[PF_SK_STACK]; PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); if (pd->sport) *pd->sport = skt->port[pd->sidx]; if (pd->dport) *pd->dport = skt->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } s->src.seqhi = htonl(arc4random()); /* Find mss option */ int rtid = M_GETFIB(m); mss = pf_get_mss(m, off, th->th_off, pd->af); mss = pf_calc_mss(pd->src, pd->af, rtid, mss); mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); s->src.mss = mss; pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL); REASON_SET(&reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } return (PF_PASS); csfailed: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); if (sn != NULL) { struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (--sn->states == 0 && sn->expire == 0) { pf_unlink_src_node(sn); uma_zfree(V_pf_sources_z, sn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_HASHROW_UNLOCK(sh); } if (nsn != sn && nsn != NULL) { struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)]; PF_HASHROW_LOCK(sh); if (--nsn->states == 0 && nsn->expire == 0) { pf_unlink_src_node(nsn); uma_zfree(V_pf_sources_z, nsn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_HASHROW_UNLOCK(sh); } return (PF_DROP); } static 
int pf_test_fragment(struct pf_krule **rm, int direction, struct pfi_kkif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_krule **am, struct pf_kruleset **rsm) { struct pf_krule *r, *a = NULL; struct pf_kruleset *ruleset = NULL; sa_family_t af = pd->af; u_short reason; int tag = -1; int asd = 0; int match = 0; struct pf_kanchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); while (r != NULL) { counter_u64_add(r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->src.port_op || r->dst.port_op || r->flagset)) r = TAILQ_NEXT(r, entries); else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) r = TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= (arc4random() % (UINT_MAX - 1) + 1)) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else { if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log) PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd, 1); if (r->action != PF_PASS) return (PF_DROP); if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pfi_kkif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, int *copyback) { struct tcphdr *th = pd->hdr.tcp; u_int16_t win = ntohs(th->th_win); u_int32_t ack, end, seq, orig_seq; u_int8_t sws, dws; int ackskew; if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { sws = src->wscale & PF_WSCALE_MASK; dws = dst->wscale & PF_WSCALE_MASK; } else sws = dws = 0; /* * Sequence tracking algorithm from Guido van Rooij's paper: * http://www.madison-gurkha.com/publications/tcp_filtering/ * tcp_filtering.ps */ orig_seq = seq = ntohl(th->th_seq); if (src->seqlo == 0) { /* First packet from this end. 
Set its state */ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && src->scrub == NULL) { if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } } /* Deferred generation of sequence number modulator */ if (dst->seqdiff && !src->seqdiff) { /* use random iss for the TCP server */ while ((src->seqdiff = arc4random() - seq) == 0) ; ack = ntohl(th->th_ack) - dst->seqdiff; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } else { ack = ntohl(th->th_ack); } end = seq + pd->p_len; if (th->th_flags & TH_SYN) { end++; if (dst->wscale & PF_WSCALE_FLAG) { src->wscale = pf_get_wscale(m, off, th->th_off, pd->af); if (src->wscale & PF_WSCALE_FLAG) { /* Remove scale factor from initial * window */ sws = src->wscale & PF_WSCALE_MASK; win = ((u_int32_t)win + (1 << sws) - 1) >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; /* in case of a retrans SYN|ACK */ dst->wscale = 0; } } } if (th->th_flags & TH_FIN) end++; src->seqlo = seq; if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; /* * May need to slide the window (seqhi may have been set by * the crappy stack check or if we picked up the connection * after establishment) */ if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); if (win > src->max_win) src->max_win = win; } else { ack = ntohl(th->th_ack) - dst->seqdiff; if (src->seqdiff) { /* Modulate sequence numbers */ pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; if (th->th_flags & TH_FIN) end++; } if ((th->th_flags & TH_ACK) == 0) { /* Let it pass through the ack skew check */ ack = dst->seqlo; } else if ((ack == 0 && (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || /* broken tcp stacks do not set ack */ (dst->state < TCPS_SYN_SENT)) { /* * Many stacks (ours included) will set the ACK number in an * FIN|ACK if the SYN times out -- no sequence to ACK. */ ack = dst->seqlo; } if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; end = seq; } ackskew = dst->seqlo - ack; /* * Need to demodulate the sequence numbers in any TCP SACK options * (Selective ACK). We could optionally validate the SACK values * against the current ACK window, either forwards or backwards, but * I'm not confident that SACK has been implemented properly * everywhere. It wouldn't surprise me if several stacks accidentally * SACK too far backwards of previously ACKed data. There really aren't * any security implications of bad SACKing unless the target stack * doesn't validate the option length correctly. Someone trying to * spoof into a TCP connection won't bother blindly sending SACK * options anyway. 
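 * (Added note: pf_modulate_sack() below applies the peer's seqdiff to each
 * SACK block, mirroring the adjustment already made to th_ack.)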
*/ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { if (pf_modulate_sack(m, off, pd, th, dst)) *copyback = 1; } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ (ackskew <= (MAXACKWINDOW << sws)) && /* Acking not more than one window forward */ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || (pd->flags & PFDESC_IP_REAS) == 0)) { /* Require an exact/+1 sequence match on resets when possible */ if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* update states */ if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) dst->state = TCPS_FIN_WAIT_2; } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; /* Fall through to PASS packet */ } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { /* Within a window backward of the originating packet */ /* * This currently handles three situations: * 1) Stupid stacks will shotgun SYNs before their peer * replies. * 2) When PF catches an already established stream (the * firewall rebooted, the state table was flushed, routes * changed...) * 3) Packets get funky immediately after the connection * closes (this should catch Solaris spurious ACK|FINs * that web servers like to spew after a close) * * This must be a little more careful than the above code * since packet floods will also be caught here. We don't * update the TTL here to mitigate the damage of a packet * flood and so the same code can handle awkward establishment * and a loosened connection close. * In the establishment case, a correct peer response will * validate the connection, go through the normal state code * and keep updating the state TTL. 
*/ if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: loose state match: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)counter_u64_fetch((*state)->packets[0]), (unsigned long long)counter_u64_fetch((*state)->packets[1]), pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); } if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* * Cannot set dst->seqhi here since this could be a shotgunned * SYN and not an already established connection. */ if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* Fall through to PASS packet */ } else { if ((*state)->dst.state == TCPS_SYN_SENT && (*state)->src.state == TCPS_SYN_SENT) { /* Send RST for state mismatches during handshake */ if (!(th->th_flags & TH_RST)) pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), 0, TH_RST, 0, 0, (*state)->rule.ptr->return_ttl, 1, 0, kif->pfik_ifp); src->seqlo = 0; src->seqhi = 1; src->max_win = 1; } else if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD state: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)counter_u64_fetch((*state)->packets[0]), (unsigned long long)counter_u64_fetch((*state)->packets[1]), pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6'); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pf_pdesc *pd, u_short *reason) { struct tcphdr *th = pd->hdr.tcp; if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) { dst->state = TCPS_FIN_WAIT_2; } else if (src->state == TCPS_SYN_SENT && dst->state < TCPS_SYN_SENT) { /* * Handle a special sloppy case where we only see one * half of the connection. If there is a ACK after * the initial SYN without ever seeing a packet from * the destination, set the connection to established. 
*/ dst->state = src->state = TCPS_ESTABLISHED; if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (src->state == TCPS_CLOSING && dst->state == TCPS_ESTABLISHED && dst->seqlo == 0) { /* * Handle the closing of half connections where we * don't see the full bidirectional FIN/ACK+ACK * handshake. */ dst->state = TCPS_CLOSING; } } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; return (PF_PASS); } static int pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_state_key_cmp key; struct tcphdr *th = pd->hdr.tcp; int copyback = 0; struct pf_state_peer *src, *dst; struct pf_state_key *sk; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = th->th_sport; key.port[1] = th->th_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = th->th_sport; key.port[0] = th->th_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } sk = (*state)->key[pd->didx]; if ((*state)->src.state == PF_TCPS_PROXY_SRC) { if (direction != (*state)->direction) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } if (th->th_flags & TH_SYN) { if (ntohl(th->th_seq) != (*state)->src.seqlo) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, (*state)->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } else (*state)->src.state = PF_TCPS_PROXY_DST; } if ((*state)->src.state == PF_TCPS_PROXY_DST) { if (direction == (*state)->direction) { if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } (*state)->src.max_win = MAX(ntohs(th->th_win), 1); if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(arc4random()); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, 0, (*state)->tag, NULL); REASON_SET(reason, 
PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (((th->th_flags & (TH_SYN|TH_ACK)) != (TH_SYN|TH_ACK)) || (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else { (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); (*state)->dst.seqlo = ntohl(th->th_seq); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ntohl(th->th_seq) + 1, TH_ACK, (*state)->src.max_win, 0, 0, 0, (*state)->tag, NULL); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL); (*state)->src.seqdiff = (*state)->dst.seqhi - (*state)->src.seqlo; (*state)->dst.seqdiff = (*state)->src.seqhi - (*state)->dst.seqlo; (*state)->src.seqhi = (*state)->src.seqlo + (*state)->dst.max_win; (*state)->dst.seqhi = (*state)->dst.seqlo + (*state)->src.max_win; (*state)->src.wscale = (*state)->dst.wscale = 0; (*state)->src.state = (*state)->dst.state = TCPS_ESTABLISHED; REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && dst->state >= TCPS_FIN_WAIT_2 && src->state >= TCPS_FIN_WAIT_2) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state reuse "); pf_print_state(*state); pf_print_flags(th->th_flags); printf("\n"); } /* XXX make sure it's the same direction ?? */ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; pf_unlink_state(*state, PF_ENTER_LOCKED); *state = NULL; return (PF_DROP); } if ((*state)->state_flags & PFSTATE_SLOPPY) { if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP) return (PF_DROP); } else { if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason, &copyback) == PF_DROP) return (PF_DROP); } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != th->th_sport) pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != th->th_dport) pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, pd->af); copyback = 1; } /* Copyback sequence modulation or stateful scrub changes if needed */ if (copyback) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); } static int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; struct udphdr *uh = pd->hdr.udp; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = uh->uh_sport; key.port[1] = uh->uh_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = uh->uh_sport; key.port[0] = uh->uh_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFUDPS_SINGLE) src->state =
PFUDPS_SINGLE; if (dst->state == PFUDPS_SINGLE) dst->state = PFUDPS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) (*state)->timeout = PFTM_UDP_MULTIPLE; else (*state)->timeout = PFTM_UDP_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != uh->uh_sport) pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != uh->uh_dport) pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); } return (PF_PASS); } static int pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kkif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0, *icmpsum; u_int8_t icmptype, icmpcode; int state_icmp = 0; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } if (!state_icmp) { /* * ICMP query/reply message not related to a TCP/UDP packet. * Search for an ICMP state. 
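	 * (Added note: the ICMP id fills both port slots of the state key, so
	 * a reply matches the state created by the corresponding query in
	 * either direction.)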
*/ key.af = pd->af; key.proto = pd->proto; key.port[0] = key.port[1] = icmpid; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); } STATE_LOOKUP(kif, &key, direction, *state, pd); (*state)->expire = time_uptime; (*state)->timeout = PFTM_ICMP_ERROR_REPLY; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[0] != pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, nk->port[pd->sidx], 0); pd->hdr.icmp->icmp_id = nk->port[pd->sidx]; } m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); break; #endif /* INET6 */ } } return (PF_PASS); } else { /* * ICMP error message in response to a TCP/UDP packet. * Extract the inner TCP/UDP header and search for that state. */ struct pf_pdesc pd2; bzero(&pd2, sizeof pd2); #ifdef INET struct ip h2; #endif /* INET */ #ifdef INET6 struct ip6_hdr h2_6; int terminal = 0; #endif /* INET6 */ int ipoff2 = 0; int off2 = 0; pd2.af = pd->af; /* Payload packet is from the opposite direction. */ pd2.sidx = (direction == PF_IN) ? 1 : 0; pd2.didx = (direction == PF_IN) ? 
0 : 1; switch (pd->af) { #ifdef INET case AF_INET: /* offset of h2 in mbuf chain */ ipoff2 = off + ICMP_MINLEN; if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip)\n")); return (PF_DROP); } /* * ICMP error messages don't refer to non-first * fragments */ if (h2.ip_off & htons(IP_OFFMASK)) { REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* offset of protocol header that follows h2 */ off2 = ipoff2 + (h2.ip_hl << 2); pd2.proto = h2.ip_p; pd2.src = (struct pf_addr *)&h2.ip_src; pd2.dst = (struct pf_addr *)&h2.ip_dst; pd2.ip_sum = &h2.ip_sum; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipoff2 = off + sizeof(struct icmp6_hdr); if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip6)\n")); return (PF_DROP); } pd2.proto = h2_6.ip6_nxt; pd2.src = (struct pf_addr *)&h2_6.ip6_src; pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; pd2.ip_sum = NULL; off2 = ipoff2 + sizeof(h2_6); do { switch (pd2.proto) { case IPPROTO_FRAGMENT: /* * ICMPv6 error messages for * non-first fragments */ REASON_SET(reason, PFRES_FRAG); return (PF_DROP); case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off2, &opt6, sizeof(opt6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMPv6 short opt\n")); return (PF_DROP); } if (pd2.proto == IPPROTO_AH) off2 += (opt6.ip6e_len + 2) * 4; else off2 += (opt6.ip6e_len + 1) * 8; pd2.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); break; #endif /* INET6 */ } if (PF_ANEQ(pd->dst, pd2.src, pd->af)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d outer dst: ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" inner src: "); pf_print_host(pd2.src, 0, pd2.af); printf(" -> "); pf_print_host(pd2.dst, 0, pd2.af); printf("\n"); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; u_int32_t seq; struct pf_state_peer *src, *dst; u_int8_t dws; int copyback = 0; /* * Only the first 8 bytes of the TCP header can be * expected. Don't access any TCP header fields after * th_seq, an ackskew test is not possible. 
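			 * (Added note: an ICMP error is only required to quote
			 * the offending IP header plus the first 64 bits of its
			 * payload.)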
*/ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(tcp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_TCP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = th.th_sport; key.port[pd2.didx] = th.th_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->dst; dst = &(*state)->src; } else { src = &(*state)->src; dst = &(*state)->dst; } if (src->wscale && dst->wscale) dws = dst->wscale & PF_WSCALE_MASK; else dws = 0; /* Demodulate sequence number */ seq = ntohl(th.th_seq) - src->seqdiff; if (src->seqdiff) { pf_change_a(&th.th_seq, icmpsum, htonl(seq), 0); copyback = 1; } if (!((*state)->state_flags & PFSTATE_SLOPPY) && (!SEQ_GEQ(src->seqhi, seq) || !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: OK ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != th.th_sport) pf_change_icmp(pd2.src, &th.th_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != th.th_dport) pf_change_icmp(pd2.dst, &th.th_dport, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); copyback = 1; } if (copyback) { switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t )&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, 8, (caddr_t)&th); } return (PF_PASS); break; } case IPPROTO_UDP: { struct udphdr uh; if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(udp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_UDP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = uh.uh_sport; key.port[pd2.didx] = uh.uh_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != uh.uh_sport) pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != uh.uh_dport) pf_change_icmp(pd2.dst, &uh.uh_dport, saddr, 
&nk->addr[pd2.didx], nk->port[pd2.didx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, sizeof(uh), (caddr_t)&uh); } return (PF_PASS); break; } #ifdef INET case IPPROTO_ICMP: { struct icmp iih; if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short i" "(icmp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp_id) pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp_id) pf_change_icmp(pd2.dst, &iih.icmp_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: { struct icmp6_hdr iih; if (!pf_pull_hdr(m, off2, &iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp6)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMPV6; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp6_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp6_id) pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp6_id) pf_change_icmp(pd2.dst, &iih.icmp6_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); m_copyback(m, off2, sizeof(struct icmp6_hdr), (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET6 */ default: { key.af = pd2.af; key.proto = pd2.proto; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = 0; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, 
&nk->addr[pd2.sidx], pd2.af)) pf_change_icmp(pd2.src, NULL, daddr, &nk->addr[pd2.sidx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af)) pf_change_icmp(pd2.dst, NULL, saddr, &nk->addr[pd2.didx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } } return (PF_PASS); break; } } } } static int pf_test_state_other(struct pf_state **state, int direction, struct pfi_kkif *kif, struct mbuf *m, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = key.port[1] = 0; } else { PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = key.port[0] = 0; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFOTHERS_SINGLE) src->state = PFOTHERS_SINGLE; if (dst->state == PFOTHERS_SINGLE) dst->state = PFOTHERS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) (*state)->timeout = PFTM_OTHER_MULTIPLE; else (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; KASSERT(nk, ("%s: nk is null", __func__)); KASSERT(pd, ("%s: pd is null", __func__)); KASSERT(pd->src, ("%s: pd->src is null", __func__)); KASSERT(pd->dst, ("%s: pd->dst is null", __func__)); switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); #endif /* INET6 */ } } return (PF_PASS); } /* * ipoff and off are measured from the start of the mbuf chain. * h must be at "ipoff" on the mbuf chain.
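 * On success pf_pull_hdr() copies "len" bytes at offset "off" into the
 * caller-supplied buffer "p" and returns p.  On failure it returns NULL and,
 * when the pointers are non-NULL, sets *actionp/*reasonp: PF_DROP/PFRES_SHORT
 * for truncated packets, PF_DROP/PFRES_FRAG for an IPv4 fragment that overlaps
 * the requested header, and PF_PASS for a later fragment that starts beyond it.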
*/ void * pf_pull_hdr(struct mbuf *m, int off, void *p, int len, u_short *actionp, u_short *reasonp, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; if (fragoff) { if (fragoff >= len) ACTION_SET(actionp, PF_PASS); else { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_FRAG); } return (NULL); } if (m->m_pkthdr.len < off + len || ntohs(h->ip_len) < off + len) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (m->m_pkthdr.len < off + len || (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < (unsigned)(off + len)) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET6 */ } m_copydata(m, off, len, p); return (p); } int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *kif, int rtableid) { struct ifnet *ifp; /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. */ if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6)) return (1); if (af != AF_INET && af != AF_INET6) return (0); /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) return (1); ifp = (kif != NULL) ? kif->pfik_ifp : NULL; switch (af) { #ifdef INET6 case AF_INET6: return (fib6_check_urpf(rtableid, &addr->v6, 0, NHR_NONE, ifp)); #endif #ifdef INET case AF_INET: return (fib4_check_urpf(rtableid, addr->v4, 0, NHR_NONE, ifp)); #endif } return (0); } #ifdef INET static void pf_route(struct mbuf **m, struct pf_krule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0, *m1; struct sockaddr_in dst; struct ip *ip; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_ksrc_node *sn = NULL; int error = 0; uint16_t ip_len, ip_off; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((pd->pf_mtag->flags & PF_DUPLICATED)) { if (s == NULL) { ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == oifp) { /* When the 2nd interface is not skipped */ return; } else { m0 = *m; *m = NULL; goto bad; } } else { pd->pf_mtag->flags |= PF_DUPLICATED; if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) { if (s) PF_STATE_UNLOCK(s); return; } } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip = mtod(m0, struct ip *); bzero(&dst, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); dst.sin_addr = ip->ip_dst; bzero(&naddr, sizeof(naddr)); if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET)) dst.sin_addr.s_addr = naddr.v4.s_addr; ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET)) dst.sin_addr.s_addr = s->rt_addr.v4.s_addr; ifp = s->rt_kif ? 
s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test(PF_OUT, 0, ifp, &m0, inp) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip)\n", __func__)); goto bad; } ip = mtod(m0, struct ip *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* Copied from FreeBSD 10.0-CURRENT ip_output. */ m0->m_pkthdr.csum_flags |= CSUM_IP; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2)); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= ifp->if_mtu || (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } m_clrprotoflags(m0); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; KMOD_IPSTAT_INC(ips_cantfrag); if (r->rt != PF_DUPTO) { icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); goto done; } else goto bad; } error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist); if (error) goto bad; for (; m0; m0 = m1) { m1 = m0->m_nextpkt; m0->m_nextpkt = NULL; if (error == 0) { m_clrprotoflags(m0); error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); } else m_freem(m0); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET */ #ifdef INET6 static void pf_route6(struct mbuf **m, struct pf_krule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0; struct sockaddr_in6 dst; struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_ksrc_node *sn = NULL; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((pd->pf_mtag->flags & PF_DUPLICATED)) { if (s == NULL) { ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { ifp = s->rt_kif ? 
s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == oifp) { /* When the 2nd interface is not skipped */ return; } else { m0 = *m; *m = NULL; goto bad; } } else { pd->pf_mtag->flags |= PF_DUPLICATED; if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) { if (s) PF_STATE_UNLOCK(s); return; } } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip6 = mtod(m0, struct ip6_hdr *); bzero(&dst, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(dst); dst.sin6_addr = ip6->ip6_dst; bzero(&naddr, sizeof(naddr)); if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &naddr, AF_INET6); ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &s->rt_addr, AF_INET6); ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; } if (s) PF_STATE_UNLOCK(s); if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, inp) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", __func__)); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & ~ifp->if_hwassist) { uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6); in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr)); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } /* * If the packet is too large for the outgoing interface, * send back an icmp6 error. */ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr)) dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) nd6_output_ifp(ifp, ifp, m0, &dst, NULL); else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); else goto bad; } done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET6 */ /* * FreeBSD supports cksum offloads for the following drivers. * em(4), fxp(4), lge(4), ndis(4), nge(4), re(4), ti(4), txp(4), xl(4) * * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : * network driver performed cksum including pseudo header, need to verify * csum_data * CSUM_DATA_VALID : * network driver performed cksum, needs to additional pseudo header * cksum computation with partial csum_data(i.e. lack of H/W support for * pseudo header, for instance sk(4) and possibly gem(4)) * * After validating the cksum of packet, set both flag CSUM_DATA_VALID and * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper * TCP/UDP layer. * Also, set csum_data to 0xffff to force cksum validation. 
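 *
 * Concretely, pf_check_proto_cksum() below accepts a TCP/UDP packet when
 * (csum_data ^ 0xffff) == 0 for CSUM_DATA_VALID | CSUM_PSEUDO_HDR, folds the
 * pseudo header in via in_pseudo(ip_src, ip_dst, htonl(len + csum_data +
 * proto)) ^ 0xffff for CSUM_DATA_VALID alone, and otherwise falls back to a
 * software in_cksum()/in4_cksum()/in6_cksum() over the mbuf chain.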
*/ static int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t sum = 0; int hw_assist = 0; struct ip *ip; if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (p) { case IPPROTO_TCP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_UDP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ break; default: return (1); } if (!hw_assist) { switch (af) { case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } } if (sum) { switch (p) { case IPPROTO_TCP: { KMOD_TCPSTAT_INC(tcps_rcvbadsum); break; } case IPPROTO_UDP: { KMOD_UDPSTAT_INC(udps_badsum); break; } #ifdef INET case IPPROTO_ICMP: { KMOD_ICMPSTAT_INC(icps_checksum); break; } #endif #ifdef INET6 case IPPROTO_ICMPV6: { KMOD_ICMP6STAT_INC(icp6s_checksum); break; } #endif /* INET6 */ } return (1); } else { if (p == IPPROTO_TCP || p == IPPROTO_UDP) { m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } return (0); } #ifdef INET int pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kkif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0; struct ip *h = NULL; struct m_tag *ipfwtag; struct pf_krule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_kruleset *ruleset = NULL; struct pf_pdesc pd; int off, dirndx, pqid = 0; PF_RULES_RLOCK_TRACKER; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); kif = (struct pfi_kkif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); pd.pf_mtag = pf_find_mtag(m); PF_RULES_RLOCK(); if (ip_divert_ptr != NULL && ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; goto done; } pd.pf_mtag->flags |= PF_PACKET_LOOPED; m_tag_delete(m, ipfwtag); } if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) { m->m_flags |= M_FASTFWD_OURS; pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT; } } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { /* We do IP header normalization and packet reassembly here */ action = PF_DROP; goto done; } m = *m0; /* 
pf_normalize messes with m0 */ h = mtod(m, struct ip *); off = h->ip_hl << 2; if (off < (int)sizeof(struct ip)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } pd.src = (struct pf_addr *)&h->ip_src; pd.dst = (struct pf_addr *)&h->ip_dst; pd.sport = pd.dport = NULL; pd.ip_sum = &h->ip_sum; pd.proto_sum = NULL; pd.proto = h->ip_p; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET; pd.tos = h->ip_tos & ~IPTOS_ECN_MASK; pd.tot_len = ntohs(h->ip_len); /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); goto done; } switch (h->ip_p) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { struct icmp ih; pd.hdr.icmp = &ih; if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } #ifdef INET6 case IPPROTO_ICMPV6: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv4 packet with ICMPv6 payload\n")); goto done; } #endif default: action = pf_test_state_other(&s, dir, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (action == PF_PASS && h->ip_hl > 5 && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = r->log; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with ip options\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); if (r->scrub_flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) pqid = 1; if (pf_ieee8021q_setpcp(m, 
r->set_prio[pqid])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (pqid || (pd.tos & IPTOS_LOWDELAY)) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ /* * connections redirected to loopback should not match sockets * bound specifically to loopback due to security implications, * see tcp_input() and in_pcblookup_listen(). */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN_LOOPBACK(ntohl(pd.dst->v4.s_addr))) m->m_flags |= M_SKIP_FIREWALL; if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && !PACKET_LOOPED(&pd)) { ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (ipfwtag != NULL) { ((struct ipfw_rule_ref *)(ipfwtag+1))->info = ntohs(r->divert.port); ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; if (s) PF_STATE_UNLOCK(s); m_tag_prepend(m, ipfwtag); if (m->m_flags & M_FASTFWD_OURS) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate tag\n")); } else { pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT; m->m_flags &= ~M_FASTFWD_OURS; } } ip_divert_ptr(*m0, dir == PF_IN); *m0 = NULL; return (action); } else { /* XXX: ipfw has the same behaviour! */ action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate divert tag\n")); } } if (log) { struct pf_krule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } counter_u64_add(kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS], pd.tot_len); counter_u64_add(kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS], 1); if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); counter_u64_add(r->packets[dirndx], 1); counter_u64_add(r->bytes[dirndx], pd.tot_len); if (a != NULL) { counter_u64_add(a->packets[dirndx], 1); counter_u64_add(a->bytes[dirndx], pd.tot_len); } if (s != NULL) { if (s->nat_rule.ptr != NULL) { counter_u64_add(s->nat_rule.ptr->packets[dirndx], 1); counter_u64_add(s->nat_rule.ptr->bytes[dirndx], pd.tot_len); } if (s->src_node != NULL) { counter_u64_add(s->src_node->packets[dirndx], 1); counter_u64_add(s->src_node->bytes[dirndx], pd.tot_len); } if (s->nat_src_node != NULL) { counter_u64_add(s->nat_src_node->packets[dirndx], 1); counter_u64_add(s->nat_src_node->bytes[dirndx], pd.tot_len); } dirndx = (dir == s->direction) ? 0 : 1; counter_u64_add(s->packets[dirndx], 1); counter_u64_add(s->bytes[dirndx], pd.tot_len); } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? 
pd.src : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_OUT)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_IN)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route() returns unlocked. */ if (r->rt) { pf_route(m0, r, dir, kif->pfik_ifp, s, &pd, inp); return (action); } break; } if (s) PF_STATE_UNLOCK(s); return (action); } #endif /* INET */ #ifdef INET6 int pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kkif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0, *n = NULL; struct m_tag *mtag; struct ip6_hdr *h = NULL; struct pf_krule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_kruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0; PF_RULES_RLOCK_TRACKER; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED) return (PF_PASS); kif = (struct pfi_kkif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); PF_RULES_RLOCK(); /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip6_hdr *); /* * we do not support jumbogram. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. */ if (htons(h->ip6_plen) == 0) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; pd.sport = pd.dport = NULL; pd.ip_sum = NULL; pd.proto_sum = NULL; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 
1 : 0; pd.af = AF_INET6; - pd.tos = (ntohl(h->ip6_flow) >> 20) & 0xfc; + pd.tos = IPV6_DSCP(h); pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); pd.proto = h->ip6_nxt; do { switch (pd.proto) { case IPPROTO_FRAGMENT: action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); if (action == PF_DROP) REASON_SET(&reason, PFRES_FRAG); goto done; case IPPROTO_ROUTING: { struct ip6_rthdr rthdr; if (rh_cnt++) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 more than one rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 rthdr0\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } /* FALLTHROUGH */ } case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); action = PF_DROP; log = 1; goto done; } if (pd.proto == IPPROTO_AH) off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); /* if there's no routing header, use unmodified mbuf for checksumming */ if (!n) n = m; switch (pd.proto) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv6 packet with ICMPv4 payload\n")); goto done; } case IPPROTO_ICMPV6: { struct icmp6_hdr ih; pd.hdr.icmp6 = &ih; if (!pf_pull_hdr(m, off, &ih, sizeof(ih), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } default: action = pf_test_state_other(&s, 
dir, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (n != m) { m_freem(n); n = NULL; } /* handle dangerous IPv6 extension headers. */ if (action == PF_PASS && rh_cnt && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = r->log; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with dangerous v6 headers\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); if (r->scrub_flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) pqid = 1; if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (pd.tos & IPTOS_LOWDELAY) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) m->m_flags |= M_SKIP_FIREWALL; /* XXX: Anybody working on it?! */ if (r->divert.port) printf("pf: divert(9) is not supported for IPv6\n"); if (log) { struct pf_krule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } counter_u64_add(kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS], pd.tot_len); counter_u64_add(kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS], 1); if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); counter_u64_add(r->packets[dirndx], 1); counter_u64_add(r->bytes[dirndx], pd.tot_len); if (a != NULL) { counter_u64_add(a->packets[dirndx], 1); counter_u64_add(a->bytes[dirndx], pd.tot_len); } if (s != NULL) { if (s->nat_rule.ptr != NULL) { counter_u64_add(s->nat_rule.ptr->packets[dirndx], 1); counter_u64_add(s->nat_rule.ptr->bytes[dirndx], pd.tot_len); } if (s->src_node != NULL) { counter_u64_add(s->src_node->packets[dirndx], 1); counter_u64_add(s->src_node->bytes[dirndx], pd.tot_len); } if (s->nat_src_node != NULL) { counter_u64_add(s->nat_src_node->packets[dirndx], 1); counter_u64_add(s->nat_src_node->bytes[dirndx], pd.tot_len); } dirndx = (dir == s->direction) ? 0 : 1; counter_u64_add(s->packets[dirndx], 1); counter_u64_add(s->bytes[dirndx], pd.tot_len); } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]->addr[0], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? 
pd.dst : &s->key[(s->direction == PF_IN)]->addr[1], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route6() returns unlocked. */ if (r->rt) { pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd, inp); return (action); } break; } if (s) PF_STATE_UNLOCK(s); /* If reassembled packet passed, create new fragments. */ if (action == PF_PASS && *m0 && (pflags & PFIL_FWD) && (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL) action = pf_refragment6(ifp, m0, mtag); return (action); } #endif /* INET6 */