diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index ad1811fc68cd..00d0ac2fa9d5 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1,2582 +1,2581 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include #endif #include -#include #include #include #include VNET_DEFINE_STATIC(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); VNET_DEFINE_STATIC(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); VNET_DEFINE_STATIC(int, functions_inherit_listen_socket_stack) = 1; #define V_functions_inherit_listen_socket_stack \ VNET(functions_inherit_listen_socket_stack) SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(functions_inherit_listen_socket_stack), 0, "Inherit listen socket's stack"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, const struct mbuf *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *, uint16_t); static void syncache_pause(struct in_conninfo *); static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout with default values of * tcp_rexmit_initial * ( 1 + * tcp_backoff[1] + * tcp_backoff[2] + * tcp_backoff[3]) + 3 * tcp_rexmit_slop, * 1000 ms * (1 + 2 + 4 + 8) + 3 * 200 ms = 15600 ms, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 VNET_DEFINE_STATIC(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP SYN cache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); SYSCTL_BOOL(_net_inet_tcp_syncache, OID_AUTO, see_other, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.see_other), 0, "All syncache(4) entries are visible, ignoring UID/GID, jail(2) " "and mac(4) checks"); static int sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS) { int error; u_int new; new = V_tcp_syncache.rexmt_limit; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if (new > TCP_MAXRXTSHIFT) error = EINVAL; else V_tcp_syncache.rexmt_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_syncache.rexmt_limit), 0, sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI", "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; V_tcp_syncache.hashbase[i].sch_last_overflow = -(SYNCOOKIE_LIFETIME + 1); } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); /* Initialize the pause machinery. */ mtx_init(&V_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.pause_co, &V_tcp_syncache.pause_mtx, 0); V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME; V_tcp_syncache.pause_backoff = 0; V_tcp_syncache.paused = false; } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* * Stop the re-seed timer before freeing resources. No need to * possibly schedule it another time. */ callout_drain(&V_tcp_syncache.secret.reseed); /* Stop the SYN cache pause callout. */ mtx_lock(&V_tcp_syncache.pause_mtx); if (callout_stop(&V_tcp_syncache.pause_co) == 0) { mtx_unlock(&V_tcp_syncache.pause_mtx); callout_drain(&V_tcp_syncache.pause_co); } else mtx_unlock(&V_tcp_syncache.pause_mtx); /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); mtx_destroy(&V_tcp_syncache.pause_mtx); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); syncache_pause(&sc->sc_inc); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); sch->sch_last_overflow = time_uptime; syncache_drop(sc2, sch); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { int rexmt; if (sc->sc_rxmits == 0) rexmt = tcp_rexmit_initial; else TCPT_RANGESET(rexmt, tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits], tcp_rexmit_min, TCPTV_REXMTMAX); sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; struct epoch_tracker et; int tick = ticks; char *s; bool paused; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; /* * If we have paused processing, unconditionally remove * all syncache entries. */ mtx_lock(&V_tcp_syncache.pause_mtx); paused = V_tcp_syncache.paused; mtx_unlock(&V_tcp_syncache.pause_mtx); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { if (paused) { syncache_drop(sc, sch); continue; } /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_ecn_maxretries) { sc->sc_flags &= ~SCF_ECN_MASK; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } NET_EPOCH_ENTER(et); syncache_respond(sc, NULL, TH_SYN|TH_ACK); NET_EPOCH_EXIT(et); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Returns true if the system is only using cookies at the moment. * This could be due to a sysadmin decision to only use cookies, or it * could be due to the system detecting an attack. */ static inline bool syncache_cookiesonly(void) { return (V_tcp_syncookies && (V_tcp_syncache.paused || V_tcp_syncookiesonly)); } /* * Find the hash bucket for the given connection. */ static struct syncache_head * syncache_hashbucket(struct in_conninfo *inc) { uint32_t hash; /* * The hash is built on foreign port + local port + foreign address. * We rely on the fact that struct in_conninfo starts with 16 bits * of foreign port, then 16 bits of local port then followed by 128 * bits of foreign address. In case of IPv4 address, the first 3 * 32-bit words of the address always are zeroes. */ hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; return (&V_tcp_syncache.hashbase[hash]); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; *schp = sch = syncache_hashbucket(inc); SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, sizeof(struct in_endpoints)) == 0) break; return (sc); /* Always returns with locked sch. */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. * If required send a challenge ACK. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m, uint16_t port) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (tcp_get_flags(th) & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* The remote UDP encaps port does not match. */ if (sc->sc_port != port) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with matching " "syncache entry but non-matching UDP encaps port, " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * RFC 793 page 69: * There are four cases for the acceptability test for an incoming * segment: * * Segment Receive Test * Length Window * ------- ------- ------------------------------------------- * 0 0 SEG.SEQ = RCV.NXT * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND * >0 0 not acceptable * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND * * Note that when receiving a SYN segment in the LISTEN state, * IRS is set to SEG.SEQ and RCV.NXT is set to SEG.SEQ+1, as * described in RFC 793, page 66. */ if ((SEQ_GEQ(th->th_seq, sc->sc_irs + 1) && SEQ_LT(th->th_seq, sc->sc_irs + 1 + sc->sc_wnd)) || (sc->sc_wnd == 0 && th->th_seq == sc->sc_irs + 1)) { if (V_tcp_insecure_rst || th->th_seq == sc->sc_irs + 1) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote " "endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid " " SEQ %u != NXT %u (+WND %u), " "sending challenge ACK\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); syncache_respond(sc, m, TH_ACK); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "NXT %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc, uint16_t port) { struct syncache *sc; struct syncache_head *sch; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if ((sc != NULL) && (sc->sc_port == port)) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq, uint16_t port) { struct syncache *sc; struct syncache_head *sch; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the port != sc_port, then it's a bogus ICMP msg */ if (port != sc->sc_port) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. * * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; NET_EPOCH_ASSERT(); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. */ if ((so = solisten_clone(lso)) == NULL) goto allocfail; #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif error = in_pcballoc(so, &V_tcbinfo); if (error) { sodealloc(so); goto allocfail; } inp = sotoinpcb(so); if ((tp = tcp_newtcpcb(inp)) == NULL) { in_pcbdetach(inp); in_pcbfree(inp); sodealloc(so); goto allocfail; } inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_ip_ttl = sc->sc_ip_ttl; inp->inp_ip_tos = sc->sc_ip_tos; inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); #ifdef NUMA inp->inp_numa_domain = m->m_pkthdr.numa_domain; #endif } inp->inp_lport = sc->sc_inc.inc_lport; #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { struct inpcb *oinp = sotoinpcb(lso); /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); inp->in6p_hops = oinp->in6p_hops; } if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct in6_addr laddr6; struct sockaddr_in6 sin6; sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; INP_HASH_WLOCK(&V_tcbinfo); error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, thread0.td_ucred, m, false); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) { inp->in6p_laddr = laddr6; goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, thread0.td_ucred, false); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) { inp->inp_laddr = laddr; goto abort; } } #endif /* INET */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Copy old policy into new socket's. */ if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0) printf("syncache_socket: could not copy policy\n"); #endif tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tp->t_port = sc->sc_port; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) { /* * Our parents t_fb was not the default, * we need to release our ref on tp->t_fb and * pickup one on the new entry. */ struct tcp_function_block *rblk; rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = rblk; /* * XXXrrs this is quite dangerous, it is possible * for the new function to fail to init. We also * are not asking if the handoff_is_ok though at * the very start thats probalbly ok. */ if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } tcp_ecn_syncache_socket(tp, sc); /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime; tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); TCPSTAT_INC(tcps_accepts); TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, TCPS_LISTEN); if (!solisten_enqueue(so, SS_ISCONNECTED)) tp->t_flags |= TF_SONOTCONN; return (so); allocfail: /* * Drop the connection; we will either send a RST or have the peer * retransmit its SYN again after its RTO and try again. */ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_listendrop); return (NULL); abort: in_pcbdetach(inp); in_pcbfree(inp); sodealloc(so); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in%s_pcbconnect failed with error %i\n", s, __func__, (sc->sc_inc.inc_flags & INC_ISIPV6) ? "6" : "", error); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_listendrop); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. * * On syncache_socket() success the newly created socket * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m, uint16_t port) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; bool locked; NET_EPOCH_ASSERT(); KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); if (syncache_cookiesonly()) { sc = NULL; sch = syncache_hashbucket(inc); locked = false; } else { sc = syncache_lookup(inc, &sch); /* returns locked sch */ locked = true; SCH_LOCK_ASSERT(sch); } #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop, port); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. Check if syncookies are used in case of syncache * overflows * B. See if this socket has had a syncache entry dropped in * the recent past. We don't want to accept a bogus * syncookie if we've never received a SYN or accept it * twice. * C. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (locked && !V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } if (locked && !V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (no syncache entry)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port); if (locked) SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. */ if ((to->to_flags & TOF_SIGNATURE) != 0 && (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0)) { /* Drop the ACK. */ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment rejected, " "MD5 signature doesn't match.\n", s, __func__); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_sig_err_sigopt); return (-1); /* Do not send RST */ } #endif /* TCP_SIGNATURE */ TCPSTATES_INC(TCPS_SYN_RECEIVED); } else { if (sc->sc_port != port) { SCH_UNLOCK(sch); return (0); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, check that * received ACK has signature and it is correct. * If not, drop the ACK and leave sc entry in th cache, * because SYN was received with correct signature. */ if (sc->sc_flags & SCF_SIGNATURE) { if ((to->to_flags & TOF_SIGNATURE) == 0) { /* No signature */ TCPSTAT_INC(tcps_sig_err_nosigopt); SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature wasn't " "provided.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) { /* Doesn't match or no SA */ SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature doesn't " "match.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } } #endif /* TCP_SIGNATURE */ /* * RFC 7323 PAWS: If we have a timestamp on this segment and * it's less than ts_recent, drop it. * XXXMT: RFC 7323 also requires to send an ACK. * In tcp_input.c this is only done for TCP segments * with user data, so be consistent here and just drop * the segment. */ if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS && TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: SEG.TSval %u < TS.Recent %u, " "segment dropped\n", s, __func__, to->to_tsval, sc->sc_tsreflect); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not " "expected, segment processed normally\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless the missing timestamps are tolerated. * See section 3.2 of RFC 7323. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if (V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } } TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL) { TCPSTATES_DEC(TCPS_SYN_RECEIVED); if (sc != &scs) syncache_free(sc); } if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } static struct socket * syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m, uint64_t response_cookie) { struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; struct socket *so; NET_EPOCH_ASSERT(); pending_counter = intotcpcb(sotoinpcb(lso))->t_tfo_pending; so = syncache_socket(sc, lso, m); if (so == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { soisconnected(so); inp = sotoinpcb(so); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie.server = response_cookie; tp->snd_max = tp->iss; tp->snd_nxt = tp->iss; tp->t_tfo_pending = pending_counter; TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_completed); } return (so); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. * * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) * cookie is processed and a new socket is created. In this case, any data * accompanying the SYN will be queued to the socket by tcp_input() and will * be ACKed either when the application sends response data or the delayed * ACK timer expires, whichever comes first. */ struct socket * syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod, void *todctx, uint8_t iptos, uint16_t port) { struct tcpcb *tp; struct socket *rv = NULL; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, ip_ttl, ip_tos; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; uint64_t tfo_response_cookie; unsigned int *tfo_pending = NULL; int tfo_cookie_valid = 0; int tfo_response_cookie_valid = 0; bool locked; INP_RLOCK_ASSERT(inp); /* listen socket */ KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { if (inp->inp_flags & IN6P_AUTOFLOWLABEL) { autoflowlabel = 1; } ip_ttl = in6_selecthlim(inp, NULL); if ((inp->in6p_outputopts == NULL) || (inp->in6p_outputopts->ip6po_tclass == -1)) { ip_tos = 0; } else { ip_tos = inp->in6p_outputopts->ip6po_tclass; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; } #endif win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); if (V_tcp_fastopen_server_enable && IS_FASTOPEN(tp->t_flags) && (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { /* * Limit the number of pending TFO connections to * approximately half of the queue limit. This prevents TFO * SYN floods from starving the service by filling the * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, to->to_tfo_cookie, to->to_tfo_len, &tfo_response_cookie); tfo_cookie_valid = (result > 0); tfo_response_cookie_valid = (result >= 0); } /* * Remember the TFO pending counter as it will have to be * decremented below if we don't make it to syncache_tfo_expand(). */ tfo_pending = tp->t_tfo_pending; } #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_RUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif if (!tfo_cookie_valid) INP_RUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * When the socket is TCP-MD5 enabled check that, * - a signed packet is valid * - a non-signed packet does not have a security association * * If a signed packet fails validation or a non-signed packet has a * security association, the packet will be dropped. */ if (ltflags & TF_SIGNATURE) { if (to->to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) goto done; } else { if (TCPMD5_ENABLED() && TCPMD5_INPUT(m, NULL, NULL) != ENOENT) goto done; } } else if (to->to_flags & TOF_SIGNATURE) goto done; #endif /* TCP_SIGNATURE */ /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ if (syncache_cookiesonly()) { sc = NULL; sch = syncache_hashbucket(inc); locked = false; } else { sc = syncache_lookup(inc, &sch); /* returns locked sch */ locked = true; SCH_LOCK_ASSERT(sch); } if (sc != NULL) { if (tfo_cookie_valid) INP_RUNLOCK(inp); TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; /* * Adjust ECN response if needed, e.g. different * IP ECN field, or a fallback by the remote host. */ if (sc->sc_flags & SCF_ECN_MASK) { sc->sc_flags &= ~SCF_ECN_MASK; sc->sc_flags = tcp_ecn_syncache_add(tcp_get_flags(th), iptos); } #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto donenoprobe; } if (tfo_cookie_valid) { bzero(&scs, sizeof(scs)); sc = &scs; goto skip_alloc; } /* * Skip allocating a syncache entry if we are just going to discard * it later. */ if (!locked) { bzero(&scs, sizeof(scs)); sc = &scs; } else sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) { sch->sch_last_overflow = time_uptime; syncache_drop(sc, sch); syncache_pause(inc); } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { KASSERT(locked, ("%s: bucket unexpectedly unlocked", __func__)); SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } skip_alloc: if (!tfo_cookie_valid && tfo_response_cookie_valid) sc->sc_tfo_cookie = &tfo_response_cookie; /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; sc->sc_port = port; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323 && !(ltflags & TF_NOOPT)) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if ((to->to_flags & TOF_TS) && (V_tcp_do_rfc1323 != 2)) { sc->sc_tsreflect = to->to_tsval; sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsoff = tcp_new_ts_offset(inc); } if ((to->to_flags & TOF_SCALE) && (V_tcp_do_rfc1323 != 3)) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If incoming packet has an MD5 signature, flag this in the * syncache so that syncache_respond() will do the right thing * with the SYN+ACK. */ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif /* TCP_SIGNATURE */ if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; /* ECN Handshake */ if (V_tcp_do_ecn) sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos); if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); else sc->sc_iss = arc4random(); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif if (locked) SCH_UNLOCK(sch); if (tfo_cookie_valid) { rv = syncache_tfo_expand(sc, so, m, tfo_response_cookie); /* INP_RUNLOCK(inp) will be performed by the caller */ goto tfo_expanded; } TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } goto donenoprobe; done: TCP_PROBE5(receive, NULL, NULL, m, NULL, th); donenoprobe: if (m) m_freem(m); /* * If tfo_pending is not NULL here, then a TFO SYN that did not * result in a new socket was processed and the associated pending * counter has not yet been decremented. All such TFO processing paths * transit this point. */ if (tfo_pending != NULL) tcp_fastopen_decrement_counter(tfo_pending); tfo_expanded: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif return (rv); } /* * Send SYN|ACK or ACK to the peer. Either in response to a peer's segment, * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; struct udphdr *udp = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt, ulen; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif NET_EPOCH_ASSERT(); hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); if (sc->sc_port) { tlen += sizeof(struct udphdr); } /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_port) mssopt -= V_tcp_udp_tunneling_overhead; mssopt = max(mssopt, V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small: hlen %u, sc_port %u, max_linkhdr %d + " "tlen %d + TCP_MAXOLEN %ju <= MHLEN %d", hlen, sc->sc_port, max_linkhdr, tlen, (uintmax_t)TCP_MAXOLEN, MHLEN)); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ /* Zero out traffic class and flow label. */ ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; ip6->ip6_flow |= sc->sc_flowlabel; if (sc->sc_port != 0) { ip6->ip6_nxt = IPPROTO_UDP; udp = (struct udphdr *)(ip6 + 1); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = sc->sc_port; ulen = (tlen - sizeof(struct ip6_hdr)); th = (struct tcphdr *)(udp + 1); } else { ip6->ip6_nxt = IPPROTO_TCP; th = (struct tcphdr *)(ip6 + 1); } ip6->ip6_flow |= htonl(sc->sc_ip_tos << 20); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); if (sc->sc_port == 0) { ip->ip_p = IPPROTO_TCP; th = (struct tcphdr *)(ip + 1); } else { ip->ip_p = IPPROTO_UDP; udp = (struct udphdr *)(ip + 1); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = sc->sc_port; ulen = (tlen - sizeof(struct ip)); th = (struct tcphdr *)(udp + 1); } } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; if (flags & TH_SYN) th->th_seq = htonl(sc->sc_iss); else th->th_seq = htonl(sc->sc_iss + 1); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; flags = tcp_ecn_syncache_respond(flags, sc); tcp_set_flags(th, flags); /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; if (flags & TH_SYN) { to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif if (sc->sc_tfo_cookie) { to.to_flags |= TOF_FASTOPEN; to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = sc->sc_tfo_cookie; /* don't send cookie again when retransmitting response */ sc->sc_tfo_cookie = NULL; } } if (sc->sc_flags & SCF_TIMESTAMP) { to.to_tsval = sc->sc_tsoff + tcp_ts_getticks(); to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) { KASSERT(to.to_flags & TOF_SIGNATURE, ("tcp_addoptions() didn't set tcp_signature")); /* NOTE: to.to_signature is inside of mbuf */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, to.to_signature) != 0) { m_freem(m); return (EACCES); } } #endif } else optlen = 0; if (udp) { ulen += optlen; udp->uh_ulen = htons(ulen); } M_SETFIB(m, sc->sc_inc.inc_fibnum); /* * If we have peer's SYN and it has a flowid, then let's assign it to * our SYN|ACK. ip6_output() and ip_output() will not assign flowid * to SYN|ACK due to lack of inp here. */ if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = m0->m_pkthdr.flowid; M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { if (sc->sc_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); } ip6->ip6_hlim = sc->sc_ip_ttl; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (sc->sc_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); } #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif TCP_PROBE5(send, NULL, NULL, ip, NULL, th); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our initial sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &irs, sizeof(irs)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > sc->sc_peer_mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = nitems(tcp_sc_wstab) - 1; tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = V_tcp_syncache.secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = V_tcp_syncache.secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = V_tcp_syncache.secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = htonl(sc->sc_iss) & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_tsoff = tcp_new_ts_offset(inc); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; sc->sc_port = port; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * We have overflowed a bucket. Let's pause dealing with the syncache. * This function will increment the bucketoverflow statistics appropriately * (once per pause when pausing is enabled; otherwise, once per overflow). */ static void syncache_pause(struct in_conninfo *inc) { time_t delta; const char *s; /* XXX: * 2. Add sysctl read here so we don't get the benefit of this * change without the new sysctl. */ /* * Try an unlocked read. If we already know that another thread * has activated the feature, there is no need to proceed. */ if (V_tcp_syncache.paused) return; /* Are cookied enabled? If not, we can't pause. */ if (!V_tcp_syncookies) { TCPSTAT_INC(tcps_sc_bucketoverflow); return; } /* * We may be the first thread to find an overflow. Get the lock * and evaluate if we need to take action. */ mtx_lock(&V_tcp_syncache.pause_mtx); if (V_tcp_syncache.paused) { mtx_unlock(&V_tcp_syncache.pause_mtx); return; } /* Activate protection. */ V_tcp_syncache.paused = true; TCPSTAT_INC(tcps_sc_bucketoverflow); /* * Determine the last backoff time. If we are seeing a re-newed * attack within that same time after last reactivating the syncache, * consider it an extension of the same attack. */ delta = TCP_SYNCACHE_PAUSE_TIME << V_tcp_syncache.pause_backoff; if (V_tcp_syncache.pause_until + delta - time_uptime > 0) { if (V_tcp_syncache.pause_backoff < TCP_SYNCACHE_MAX_BACKOFF) { delta <<= 1; V_tcp_syncache.pause_backoff++; } } else { delta = TCP_SYNCACHE_PAUSE_TIME; V_tcp_syncache.pause_backoff = 0; } /* Log a warning, including IP addresses, if able. */ if (inc != NULL) s = tcp_log_addrs(inc, NULL, NULL, NULL); else s = (const char *)NULL; log(LOG_WARNING, "TCP syncache overflow detected; using syncookies for " "the next %lld seconds%s%s%s\n", (long long)delta, (s != NULL) ? " (last SYN: " : "", (s != NULL) ? s : "", (s != NULL) ? ")" : ""); free(__DECONST(void *, s), M_TCPLOG); /* Use the calculated delta to set a new pause time. */ V_tcp_syncache.pause_until = time_uptime + delta; callout_reset(&V_tcp_syncache.pause_co, delta * hz, syncache_unpause, &V_tcp_syncache); mtx_unlock(&V_tcp_syncache.pause_mtx); } /* Evaluate whether we need to unpause. */ static void syncache_unpause(void *arg) { struct tcp_syncache *sc; time_t delta; sc = arg; mtx_assert(&sc->pause_mtx, MA_OWNED | MA_NOTRECURSED); callout_deactivate(&sc->pause_co); /* * Check to make sure we are not running early. If the pause * time has expired, then deactivate the protection. */ if ((delta = sc->pause_until - time_uptime) > 0) callout_schedule(&sc->pause_co, delta * hz); else sc->paused = false; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int error, i; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); xt.t_state = TCPS_SYN_RECEIVED; xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; xt.xt_inp.xi_socket.xso_len = sizeof (struct xsocket); xt.xt_inp.xi_socket.so_type = SOCK_STREAM; xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING; for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (sc->sc_cred != NULL && cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; xt.xt_encaps_port = sc->sc_port; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); return (0); } } SCH_UNLOCK(sch); } return (0); } diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 011e20692935..19c163e95094 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1,1712 +1,1709 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); VNET_DEFINE(int, udp_log_in_vain) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); VNET_DEFINE(bool, udp_blackhole_local) = false; SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false, "Enforce net.inet.udp.blackhole for locally originated packets"); u_long udp_sendspace = 9216; /* really max datagram size */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); #endif INPCBSTORAGE_DEFINE(udpcbstor, udpcb, "udpinp", "udp_inpcb", "udp", "udphash"); INPCBSTORAGE_DEFINE(udplitecbstor, udpcb, "udpliteinp", "udplite_inpcb", "udplite", "udplitehash"); static void udp_vnet_init(void *arg __unused) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE); /* Additional pcbinfo for UDP-Lite */ in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE, UDBHASHSIZE); } VNET_SYSINIT(udp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_vnet_init, NULL); /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } #ifdef VIMAGE static void udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); in_pcbinfo_destroy(&V_ulitecbinfo); } VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However if a tunneling protocol is in place we increment * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we * then decrement the reference count. If the inp_rele returns 1, indicating the * inp is gone, we return that to the caller to tell them *not* to unlock * the inp. In the case of multi-cast this will cause the distribution * to stop (though most tunneling protocols known currently do *not* use * multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *tmpopts, *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; bool filtered; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0], up->u_tun_ctx); INP_RLOCK(inp); if (filtered) return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) { m_freem(n); return (0); } if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */ if (IPSEC_ENABLED(ipv4) && UDPENCAP_INPUT(n, off, AF_INET) != 0) return (0); /* Consumed. */ } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmpopts = sbcreatecontrol(&udp_in[1], sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP, M_NOWAIT); if (tmpopts) { if (opts) { tmpopts->m_next = opts; opts = tmpopts; } else opts = tmpopts; } } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)&udp_in[0]; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { soroverflow_locked(so); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } static bool udp_multi_match(const struct inpcb *inp, void *v) { struct ip *ip = v; struct udphdr *uh = (struct udphdr *)(ip + 1); if (inp->inp_lport != uh->uh_dport) return (false); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) return (false); #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) return (false); if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) return (false); if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) return (false); return (true); } static int udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) { struct ip *ip = mtod(m, struct ip *); struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), INPLOOKUP_RLOCKPCB, udp_multi_match, ip); #ifdef KDTRACE_HOOKS struct udphdr *uh = (struct udphdr *)(ip + 1); #endif struct inpcb *inp; struct mbuf *n; int appends = 0; MPASS(ip->ip_hl == sizeof(struct ip) >> 2); while ((inp = inp_next(&inpi)) != NULL) { /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct ip_moptions *imo; struct sockaddr_in group; int blocked; imo = inp->inp_moptions; if (imo == NULL) continue; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, (struct sockaddr *)&group, (struct sockaddr *)&udp_in[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); continue; } } if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) { break; } else appends++; } /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { INP_RUNLOCK(inp); break; } } if (appends == 0) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) UDPSTAT_INC(udps_noportmcast); else UDPSTAT_INC(udps_noportbcast); } m_freem(m); return (IPPROTO_DONE); } static int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct sockaddr_in udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2); udp_in[0].sin_len = sizeof(struct sockaddr_in); udp_in[0].sin_family = AF_INET; udp_in[0].sin_port = uh->uh_sport; udp_in[0].sin_addr = ip->ip_src; udp_in[1].sin_len = sizeof(struct sockaddr_in); udp_in[1].sin_family = AF_INET; udp_in[1].sin_port = uh->uh_dport; udp_in[1].sin_addr = ip->ip_dst; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; bcopy(ipov, b, sizeof(b)); bzero(ipov, sizeof(ipov->ih_x1)); ipov->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ipov, sizeof(b)); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) return (udp_multi_input(m, proto, udp_in)); pcbinfo = udp_get_inpcbinfo(proto); /* * Locate pcb for datagram. * * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (V_udp_log_in_vain) { char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport), inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport)); } if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh); else UDP_PROBE(receive, NULL, NULL, ip, NULL, uh); UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole && (V_udp_blackhole_local || !in_localip(ip->ip_src))) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { INP_WLOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(struct icmp *icmp, struct inpcbinfo *pcbinfo) { struct ip *ip = &icmp->icmp_ip; struct udphdr *uh; struct inpcb *inp; if (icmp_errmap(icmp) == 0) return; uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) udp_notify(inp, icmp_errmap(icmp)); INP_WUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { struct udpcb *up; udp_tun_icmp_t *func; up = intoudpcb(inp); func = up->u_icmp_func; INP_RUNLOCK(inp); if (func != NULL) func(icmp); } } } static void udp_ctlinput(struct icmp *icmp) { return (udp_common_ctlinput(icmp, &V_udbinfo)); } static void udplite_ctlinput(struct icmp *icmp) { return (udp_common_ctlinput(icmp, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != 0) return (EPERM); if (req->oldptr == 0) { int n; n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_udbinfo.ipi_count; xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); error = SYSCTL_OUT(req, &xi, sizeof xi); if (error) { INP_RUNLOCK(inp); break; } } } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET case UDP_ENCAP: if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(inp, sopt); break; #endif /* INET */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET case UDP_ENCAP: if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(inp, sopt); break; #endif /* INET */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #ifdef INET6 /* The logic here is derived from ip6_setpktopt(). See comments there. */ static int udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src, struct inpcb *inp, int flags) { struct ifnet *ifp; struct in6_pktinfo *pktinfo; struct in_addr ia; if ((flags & PRUS_IPV6) == 0) return (0); if (cm->cmsg_level != IPPROTO_IPV6) return (0); if (cm->cmsg_type != IPV6_2292PKTINFO && cm->cmsg_type != IPV6_PKTINFO) return (0); if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) return (EINVAL); pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm); if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) return (EINVAL); /* Validate the interface index if specified. */ if (pktinfo->ipi6_ifindex) { struct epoch_tracker et; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(pktinfo->ipi6_ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (ENXIO); } else ifp = NULL; if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3]; if (in_ifhasaddr(ifp, ia) == 0) return (EADDRNOTAVAIL); } bzero(src, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); src->sin_port = inp->inp_lport; src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3]; return (0); } #endif /* INET6 */ -pr_send_t udp_send; /* shared with udp6_usrreq.c */ int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct udpiphdr *ui; int len, error = 0; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; struct epoch_tracker et; int cscov_partial = 0; int ipflags = 0; u_short fport, lport; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); if (addr != NULL) { if (addr->sa_family != AF_INET) error = EAFNOSUPPORT; else if (addr->sa_len != sizeof(struct sockaddr_in)) error = EINVAL; if (__predict_false(error != 0)) { m_freem(control); m_freem(m); return (error); } } len = m->m_pkthdr.len; if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; sin = (struct sockaddr_in *)addr; /* * udp_send() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. * * We will need network epoch in either case, to safely lookup into * pcb hash. */ if (sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) INP_WLOCK(inp); else INP_RLOCK(inp); NET_EPOCH_ENTER(et); tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { m_freem(control); error = EINVAL; goto release; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } #ifdef INET6 error = udp_v4mapped_pktinfo(cm, &src, inp, flags); if (error != 0) break; #endif if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); control = NULL; } if (error) goto release; pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } INP_HASH_WLOCK(pcbinfo); error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) { INP_HASH_WUNLOCK(pcbinfo); goto release; } /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; error = in_pcbinshash(inp); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } else INP_HASH_WUNLOCK(pcbinfo); } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_v = IPVERSION << 4; ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); } #if defined(ROUTE_MPATH) || defined(RSS) else if (CALC_FLOWID_OUTBOUND_SENDTO) { uint32_t hash_val, hash_type; hash_val = fib4_calc_packet_hash(laddr, faddr, lport, fport, pr, &hash_type); m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (pr == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); else UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags, inp->inp_moptions, inp); INP_UNLOCK(inp); NET_EPOCH_EXIT(et); return (error); release: INP_UNLOCK(inp); NET_EPOCH_EXIT(et); m_freem(m); return (error); } -pr_abort_t udp_abort; /* shared with udp6_usrreq.c */ void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { static uint32_t udp_flowid; struct inpcbinfo *pcbinfo; struct inpcb *inp; struct udpcb *up; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); error = in_pcballoc(so, pcbinfo); if (error) return (error); inp = sotoinpcb(so); inp->inp_ip_ttl = V_ip_defttl; inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1); inp->inp_flowtype = M_HASHTYPE_OPAQUE; up = intoudpcb(inp); bzero(&up->u_start_zero, u_zero_size); INP_WUNLOCK(inp); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) || (up->u_icmp_func != NULL))) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_icmp_func = i; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sinp; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) { /* * Preserve compatibility with old programs. */ if (nam->sa_family != AF_UNSPEC || nam->sa_len < offsetof(struct sockaddr_in, sin_zero) || sinp->sin_addr.s_addr != INADDR_ANY) return (EAFNOSUPPORT); nam->sa_family = AF_INET; } if (nam->sa_len != sizeof(struct sockaddr_in)) return (EINVAL); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); sin = (struct sockaddr_in *)nam; if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_len != sizeof(*sin)) return (EINVAL); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } NET_EPOCH_ENTER(et); INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred, true); INP_HASH_WUNLOCK(pcbinfo); NET_EPOCH_EXIT(et); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); } -pr_disconnect_t udp_disconnect; /* shared with udp6_usrreq.c */ int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET #define UDP_PROTOSW \ .pr_type = SOCK_DGRAM, \ .pr_flags = PR_ATOMIC | PR_ADDR | PR_CAPATTACH, \ .pr_ctloutput = udp_ctloutput, \ .pr_abort = udp_abort, \ .pr_attach = udp_attach, \ .pr_bind = udp_bind, \ .pr_connect = udp_connect, \ .pr_control = in_control, \ .pr_detach = udp_detach, \ .pr_disconnect = udp_disconnect, \ .pr_peeraddr = in_getpeeraddr, \ .pr_send = udp_send, \ .pr_soreceive = soreceive_dgram, \ .pr_sosend = sosend_dgram, \ .pr_shutdown = udp_shutdown, \ .pr_sockaddr = in_getsockaddr, \ .pr_sosetlabel = in_pcbsosetlabel, \ .pr_close = udp_close struct protosw udp_protosw = { .pr_protocol = IPPROTO_UDP, UDP_PROTOSW }; struct protosw udplite_protosw = { .pr_protocol = IPPROTO_UDPLITE, UDP_PROTOSW }; static void udp_init(void *arg __unused) { IPPROTO_REGISTER(IPPROTO_UDP, udp_input, udp_ctlinput); IPPROTO_REGISTER(IPPROTO_UDPLITE, udp_input, udplite_ctlinput); } SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL); #endif /* INET */ diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index 26e56be53f16..2b91518c0ff3 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -1,181 +1,187 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_UDP_VAR_H_ #define _NETINET_UDP_VAR_H_ #include #include #include /* * UDP kernel structures and variables. */ struct udpiphdr { struct ipovly ui_i; /* overlaid ip structure */ struct udphdr ui_u; /* udp header */ }; #define ui_x1 ui_i.ih_x1 #define ui_v ui_i.ih_x1[0] #define ui_pr ui_i.ih_pr #define ui_len ui_i.ih_len #define ui_src ui_i.ih_src #define ui_dst ui_i.ih_dst #define ui_sport ui_u.uh_sport #define ui_dport ui_u.uh_dport #define ui_ulen ui_u.uh_ulen #define ui_sum ui_u.uh_sum /* * Identifiers for UDP sysctl nodes. */ #define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ #define UDPCTL_STATS 2 /* statistics (read-only) */ #define UDPCTL_MAXDGRAM 3 /* max datagram size */ #define UDPCTL_RECVSPACE 4 /* default receive buffer space */ #define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ /* IPsec: ESP in UDP tunneling: */ #define UF_ESPINUDP_NON_IKE 0x00000001 /* w/ non-IKE marker .. */ /* .. per draft-ietf-ipsec-nat-t-ike-0[01], * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */ #define UF_ESPINUDP 0x00000002 /* w/ non-ESP marker. */ struct udpstat { /* input statistics: */ uint64_t udps_ipackets; /* total input packets */ uint64_t udps_hdrops; /* packet shorter than header */ uint64_t udps_badsum; /* checksum error */ uint64_t udps_nosum; /* no checksum */ uint64_t udps_badlen; /* data length larger than packet */ uint64_t udps_noport; /* no socket on port */ uint64_t udps_noportbcast; /* of above, arrived as broadcast */ uint64_t udps_fullsock; /* not delivered, input socket full */ uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */ uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */ /* output statistics: */ uint64_t udps_opackets; /* total output packets */ uint64_t udps_fastout; /* output packets on fast path */ /* of no socket on port, arrived as multicast */ uint64_t udps_noportmcast; uint64_t udps_filtermcast; /* blocked by multicast filter */ }; #ifdef _KERNEL #include #include struct mbuf; typedef bool udp_tun_func_t(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); typedef union { struct icmp *icmp; struct ip6ctlparam *ip6cp; } udp_tun_icmp_param_t __attribute__((__transparent_union__)); typedef void udp_tun_icmp_t(udp_tun_icmp_param_t); /* * UDP control block; one per udp. */ struct udpcb { struct inpcb u_inpcb; #define u_start_zero u_tun_func #define u_zero_size (sizeof(struct udpcb) - \ offsetof(struct udpcb, u_start_zero)) udp_tun_func_t *u_tun_func; /* UDP kernel tunneling callback. */ udp_tun_icmp_t *u_icmp_func; /* UDP kernel tunneling icmp callback */ u_int u_flags; /* Generic UDP flags. */ uint16_t u_rxcslen; /* Coverage for incoming datagrams. */ uint16_t u_txcslen; /* Coverage for outgoing datagrams. */ void *u_tun_ctx; /* Tunneling callback context. */ }; #define intoudpcb(ip) __containerof((inp), struct udpcb, u_inpcb) #define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define UDPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)) #define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_udpstat_inc(int statnum); #define KMOD_UDPSTAT_INC(name) \ kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t)) SYSCTL_DECL(_net_inet_udp); VNET_DECLARE(struct inpcbinfo, udbinfo); VNET_DECLARE(struct inpcbinfo, ulitecbinfo); #define V_udbinfo VNET(udbinfo) #define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); VNET_DECLARE(int, udp_blackhole); VNET_DECLARE(bool, udp_blackhole_local); VNET_DECLARE(int, udp_log_in_vain); #define V_udp_cksum VNET(udp_cksum) #define V_udp_blackhole VNET(udp_blackhole) #define V_udp_blackhole_local VNET(udp_blackhole_local) #define V_udp_log_in_vain VNET(udp_log_in_vain) VNET_DECLARE(int, zero_checksum_port); #define V_zero_checksum_port VNET(zero_checksum_port) static __inline struct inpcbinfo * udp_get_inpcbinfo(int protocol) { return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; } int udp_ctloutput(struct socket *, struct sockopt *); void udplite_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *so); int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx); +#ifdef _SYS_PROTOSW_H_ +pr_abort_t udp_abort; +pr_disconnect_t udp_disconnect; +pr_send_t udp_send; +#endif + #endif /* _KERNEL */ #endif /* _NETINET_UDP_VAR_H_ */ diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index 79fe7d4cc0fb..484f43d24808 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -1,1295 +1,1289 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(int, zero_checksum_port) = 0; #define V_zero_checksum_port VNET(zero_checksum_port) SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(zero_checksum_port), 0, "Zero UDP checksum allowed for traffic to/from this port."); - -/* netinet/udp_usrreqs.c */ -pr_abort_t udp_abort; -pr_disconnect_t udp_disconnect; -pr_send_t udp_send; - /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ static void udp6_detach(struct socket *so); static int udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts = NULL, *tmp_opts; struct udpcb *up; bool filtered; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0], up->u_tun_ctx); INP_RLOCK(inp); if (filtered) return (in_pcbrele_rlocked(inp)); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) { m_freem(n); return (0); } } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmp_opts = sbcreatecontrol(&fromsa[1], sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR, IPPROTO_IPV6, M_NOWAIT); if (tmp_opts) { if (opts) { tmp_opts->m_next = opts; opts = tmp_opts; } else opts = tmp_opts; } } m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n, opts) == 0) { soroverflow_locked(so); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } struct udp6_multi_match_ctx { struct ip6_hdr *ip6; struct udphdr *uh; }; static bool udp6_multi_match(const struct inpcb *inp, void *v) { struct udp6_multi_match_ctx *ctx = v; if ((inp->inp_vflag & INP_IPV6) == 0) return(false); if (inp->inp_lport != ctx->uh->uh_dport) return(false); if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport) return(false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst)) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) || inp->inp_fport != ctx->uh->uh_sport)) return (false); return (true); } static int udp6_multi_input(struct mbuf *m, int off, int proto, struct sockaddr_in6 *fromsa) { struct udp6_multi_match_ctx ctx; struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx); struct inpcb *inp; struct ip6_moptions *imo; struct mbuf *n; int appends = 0; /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ ctx.ip6 = mtod(m, struct ip6_hdr *); ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off); while ((inp = inp_next(&inpi)) != NULL) { INP_RLOCK_ASSERT(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ if ((imo = inp->in6p_moptions) != NULL) { struct sockaddr_in6 mcaddr; int blocked; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ctx.ip6->ip6_dst; blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); continue; } } if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ctx.ip6, inp, ctx.uh); else UDP_PROBE(receive, NULL, inp, ctx.ip6, inp, ctx.uh); if (udp6_append(inp, n, off, fromsa)) { break; } else appends++; } /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { INP_RUNLOCK(inp); break; } } m_freem(m); if (appends == 0) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); } return (IPPROTO_DONE); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; struct sockaddr_in6 fromsa[2]; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; NET_EPOCH_ASSERT(); if (m->m_len < off + sizeof(struct udphdr)) { m = m_pullup(m, off + sizeof(struct udphdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = proto; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); /* * dport 0 was rejected earlier so this is OK even if * zero_checksum_port is 0 (which is its default value). */ if (ntohs(uh->uh_dport) == V_zero_checksum_port) goto skip_checksum; else goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } skip_checksum: /* * Construct sockaddr format source address. */ init_sin6(&fromsa[0], m, 0); fromsa[0].sin6_port = uh->uh_sport; init_sin6(&fromsa[1], m, 1); fromsa[1].sin6_port = uh->uh_dport; pcbinfo = udp_get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { *mp = NULL; return (udp6_multi_input(m, off, proto, fromsa)); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (V_udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, NULL, ip6, NULL, uh); else UDP_PROBE(receive, NULL, NULL, ip6, NULL, uh); UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole && (V_udp_blackhole_local || !in6_localaddr(&ip6->ip6_src))) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); *mp = NULL; return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); *mp = NULL; return (IPPROTO_DONE); } } if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip6, inp, uh); else UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, fromsa) == 0) INP_RUNLOCK(inp); *mp = NULL; return (IPPROTO_DONE); badunlocked: m_freem(m); *mp = NULL; return (IPPROTO_DONE); } static void udp6_common_ctlinput(struct ip6ctlparam *ip6cp, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; int errno, off = 0; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if ((errno = icmp6_errmap(ip6cp->ip6c_icmp6)) == 0) return; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); /* Check to see if its tunneled */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst, uh.uh_dport, &ip6->ip6_src, uh.uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp != NULL) { struct udpcb *up; udp_tun_icmp_t *func; up = intoudpcb(inp); func = up->u_icmp_func; INP_RUNLOCK(inp); if (func != NULL) func(ip6cp); } in6_pcbnotify(pcbinfo, ip6cp->ip6c_finaldst, uh.uh_dport, ip6cp->ip6c_src, uh.uh_sport, errno, ip6cp->ip6c_cmdarg, udp_notify); } static void udp6_ctlinput(struct ip6ctlparam *ctl) { return (udp6_common_ctlinput(ctl, &V_udbinfo)); } static void udplite6_ctlinput(struct ip6ctlparam *ctl) { return (udp6_common_ctlinput(ctl, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } NET_EPOCH_ENTER(et); inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_send(struct socket *so, int flags_arg, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct ip6_pktopts *optp, opt; struct sockaddr_in6 *sin6, tmp; struct epoch_tracker et; int cscov_partial, error, flags, hlen, scope_ambiguous; u_int32_t ulen, plen; uint16_t cscov; u_short fport; uint8_t nxt; if (addr6) { error = 0; if (addr6->sa_family != AF_INET6) error = EAFNOSUPPORT; else if (addr6->sa_len != sizeof(struct sockaddr_in6)) error = EINVAL; if (__predict_false(error != 0)) { m_freem(control); m_freem(m); return (error); } } sin6 = (struct sockaddr_in6 *)addr6; /* * In contrast to IPv4 we do not validate the max. packet length * here due to IPv6 Jumbograms (RFC2675). */ scope_ambiguous = 0; if (sin6) { /* Protect *addr6 from overwrites. */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) { if (control) m_freem(control); m_freem(m); return (error); } } inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * In the following cases we want a write lock on the inp for either * local operations or for possible route cache updates in the IPv6 * output path: * - on connected sockets (sin6 is NULL) for route cache updates, * - when we are not bound to an address and source port (it is * in6_pcbsetport() which will require the write lock). * * We check the inp fields before actually locking the inp, so * here exists a race, and we may WLOCK the inp and end with already * bound one by other thread. This is fine. */ if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0)) INP_WLOCK(inp); else INP_RLOCK(inp); nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; if (sin6 == NULL) hasv4addr = (inp->inp_vflag & INP_IPV4); else hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; if (hasv4addr) { /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_UNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); /* addr will just be freed in sendit(). */ return (udp_send(so, flags_arg | PRUS_IPV6, m, (struct sockaddr *)sin6, control, td)); } } else #endif if (sin6 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* * Given this is either an IPv6-only socket or no INET is * supported we will fail the send if the given destination * address is a v4mapped address. * * XXXGL: do we leak m and control? */ INP_UNLOCK(inp); return (EINVAL); } NET_EPOCH_ENTER(et); if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { goto release; } optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } /* * Given we handle the v4mapped case in the INET block above * assert here that it must not happen anymore. */ KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr), ("%s: sin6(%p)->sin6_addr is v4mapped which we " "should have handled.", __func__, sin6)); /* This only requires read-locking. */ error = in6_selectsrc_socket(sin6, optp, inp, td->td_ucred, scope_ambiguous, &in6a, NULL); if (error) goto release; laddr = &in6a; if (inp->inp_lport == 0) { struct inpcbinfo *pcbinfo; INP_WLOCK_ASSERT(inp); pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); INP_HASH_WLOCK(pcbinfo); error = in6_pcbsetport(laddr, inp, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; goto release; } } faddr = &sin6->sin6_addr; fport = sin6->sin6_port; /* allow 0 port */ } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } ulen = m->m_pkthdr.len; plen = sizeof(struct udphdr) + ulen; hlen = sizeof(struct ip6_hdr); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ cscov = cscov_partial = 0; udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } flags = 0; #if defined(ROUTE_MPATH) || defined(RSS) if (CALC_FLOWID_OUTBOUND_SENDTO) { uint32_t hash_type, hash_val; uint8_t pr; pr = inp->inp_socket->so_proto->pr_protocol; hash_val = fib6_calc_packet_hash(laddr, faddr, inp->inp_lport, fport, pr, &hash_type); m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } /* do not use inp flowid */ flags |= IP_NODEFAULTFLOWID; #endif UDPSTAT_INC(udps_opackets); if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); error = ip6_output(m, optp, INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); INP_UNLOCK(inp); NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); release: INP_UNLOCK(inp); NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } m_freem(m); return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); udp_abort(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } error = in_pcballoc(so, pcbinfo); if (error) return (error); inp = (struct inpcb *)so->so_pcb; inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; INP_WUNLOCK(inp); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof(struct sockaddr_in6)) return (EINVAL); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); vflagsav = inp->inp_vflag; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif if (error != 0) inp->inp_vflag = vflagsav; INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); (void)udp_disconnect(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { #ifdef INET struct epoch_tracker et; #endif struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_len != sizeof(*sin6)) return (EINVAL); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; vflagsav = inp->inp_vflag; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred, true); INP_HASH_WUNLOCK(pcbinfo); NET_EPOCH_EXIT(et); /* * If connect succeeds, mark socket as connected. If * connect fails and socket is unbound, reset inp_vflag * field. */ if (error == 0) soisconnected(so); else if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) inp->inp_vflag = vflagsav; goto out; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; vflagsav = inp->inp_vflag; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); /* * If connect succeeds, mark socket as connected. If * connect fails and socket is unbound, reset inp_vflag * field. */ if (error == 0) soisconnected(so); else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) inp->inp_vflag = vflagsav; out: INP_WUNLOCK(inp); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; struct udpcb *up; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); (void)udp_disconnect(so); return (0); } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } #define UDP6_PROTOSW \ .pr_type = SOCK_DGRAM, \ .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, \ .pr_ctloutput = ip6_ctloutput, \ .pr_abort = udp6_abort, \ .pr_attach = udp6_attach, \ .pr_bind = udp6_bind, \ .pr_connect = udp6_connect, \ .pr_control = in6_control, \ .pr_detach = udp6_detach, \ .pr_disconnect = udp6_disconnect, \ .pr_peeraddr = in6_mapped_peeraddr, \ .pr_send = udp6_send, \ .pr_shutdown = udp_shutdown, \ .pr_sockaddr = in6_mapped_sockaddr, \ .pr_soreceive = soreceive_dgram, \ .pr_sosend = sosend_dgram, \ .pr_sosetlabel = in_pcbsosetlabel, \ .pr_close = udp6_close struct protosw udp6_protosw = { .pr_protocol = IPPROTO_UDP, UDP6_PROTOSW }; struct protosw udplite6_protosw = { .pr_protocol = IPPROTO_UDPLITE, UDP6_PROTOSW }; static void udp6_init(void *arg __unused) { IP6PROTO_REGISTER(IPPROTO_UDP, udp6_input, udp6_ctlinput); IP6PROTO_REGISTER(IPPROTO_UDPLITE, udp6_input, udplite6_ctlinput); } SYSINIT(udp6_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp6_init, NULL);