diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 1a4cde642d4a..2b817fee7260 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1,2582 +1,2582 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include VNET_DEFINE_STATIC(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); VNET_DEFINE_STATIC(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); VNET_DEFINE_STATIC(int, functions_inherit_listen_socket_stack) = 1; #define V_functions_inherit_listen_socket_stack \ VNET(functions_inherit_listen_socket_stack) SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(functions_inherit_listen_socket_stack), 0, "Inherit listen socket's stack"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, const struct mbuf *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *, uint16_t); static void syncache_pause(struct in_conninfo *); static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout with default values of * tcp_rexmit_initial * ( 1 + * tcp_backoff[1] + * tcp_backoff[2] + * tcp_backoff[3]) + 3 * tcp_rexmit_slop, * 1000 ms * (1 + 2 + 4 + 8) + 3 * 200 ms = 15600 ms, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 VNET_DEFINE_STATIC(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP SYN cache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); SYSCTL_BOOL(_net_inet_tcp_syncache, OID_AUTO, see_other, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.see_other), 0, "All syncache(4) entries are visible, ignoring UID/GID, jail(2) " "and mac(4) checks"); static int sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS) { int error; u_int new; new = V_tcp_syncache.rexmt_limit; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if (new > TCP_MAXRXTSHIFT) error = EINVAL; else V_tcp_syncache.rexmt_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_syncache.rexmt_limit), 0, sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI", "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; V_tcp_syncache.hashbase[i].sch_last_overflow = -(SYNCOOKIE_LIFETIME + 1); } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); /* Initialize the pause machinery. */ mtx_init(&V_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.pause_co, &V_tcp_syncache.pause_mtx, 0); V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME; V_tcp_syncache.pause_backoff = 0; V_tcp_syncache.paused = false; } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* * Stop the re-seed timer before freeing resources. No need to * possibly schedule it another time. */ callout_drain(&V_tcp_syncache.secret.reseed); /* Stop the SYN cache pause callout. */ mtx_lock(&V_tcp_syncache.pause_mtx); if (callout_stop(&V_tcp_syncache.pause_co) == 0) { mtx_unlock(&V_tcp_syncache.pause_mtx); callout_drain(&V_tcp_syncache.pause_co); } else mtx_unlock(&V_tcp_syncache.pause_mtx); /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); mtx_destroy(&V_tcp_syncache.pause_mtx); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); syncache_pause(&sc->sc_inc); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); sch->sch_last_overflow = time_uptime; syncache_drop(sc2, sch); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { int rexmt; if (sc->sc_rxmits == 0) rexmt = tcp_rexmit_initial; else TCPT_RANGESET(rexmt, tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits], tcp_rexmit_min, TCPTV_REXMTMAX); sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; struct epoch_tracker et; int tick = ticks; char *s; bool paused; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; /* * If we have paused processing, unconditionally remove * all syncache entries. */ mtx_lock(&V_tcp_syncache.pause_mtx); paused = V_tcp_syncache.paused; mtx_unlock(&V_tcp_syncache.pause_mtx); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { if (paused) { syncache_drop(sc, sch); continue; } /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_ecn_maxretries) { sc->sc_flags &= ~SCF_ECN_MASK; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } NET_EPOCH_ENTER(et); syncache_respond(sc, NULL, TH_SYN|TH_ACK); NET_EPOCH_EXIT(et); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Returns true if the system is only using cookies at the moment. * This could be due to a sysadmin decision to only use cookies, or it * could be due to the system detecting an attack. */ static inline bool syncache_cookiesonly(void) { return (V_tcp_syncookies && (V_tcp_syncache.paused || V_tcp_syncookiesonly)); } /* * Find the hash bucket for the given connection. */ static struct syncache_head * syncache_hashbucket(struct in_conninfo *inc) { uint32_t hash; /* * The hash is built on foreign port + local port + foreign address. * We rely on the fact that struct in_conninfo starts with 16 bits * of foreign port, then 16 bits of local port then followed by 128 * bits of foreign address. In case of IPv4 address, the first 3 * 32-bit words of the address always are zeroes. */ hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; return (&V_tcp_syncache.hashbase[hash]); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; *schp = sch = syncache_hashbucket(inc); SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, sizeof(struct in_endpoints)) == 0) break; return (sc); /* Always returns with locked sch. */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. * If required send a challenge ACK. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m, uint16_t port) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (tcp_get_flags(th) & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* The remote UDP encaps port does not match. */ if (sc->sc_port != port) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with matching " "syncache entry but non-matching UDP encaps port, " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * RFC 793 page 69: * There are four cases for the acceptability test for an incoming * segment: * * Segment Receive Test * Length Window * ------- ------- ------------------------------------------- * 0 0 SEG.SEQ = RCV.NXT * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND * >0 0 not acceptable * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND * * Note that when receiving a SYN segment in the LISTEN state, * IRS is set to SEG.SEQ and RCV.NXT is set to SEG.SEQ+1, as * described in RFC 793, page 66. */ if ((SEQ_GEQ(th->th_seq, sc->sc_irs + 1) && SEQ_LT(th->th_seq, sc->sc_irs + 1 + sc->sc_wnd)) || (sc->sc_wnd == 0 && th->th_seq == sc->sc_irs + 1)) { if (V_tcp_insecure_rst || th->th_seq == sc->sc_irs + 1) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote " "endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid " " SEQ %u != NXT %u (+WND %u), " "sending challenge ACK\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); syncache_respond(sc, m, TH_ACK); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "NXT %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc, uint16_t port) { struct syncache *sc; struct syncache_head *sch; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if ((sc != NULL) && (sc->sc_port == port)) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq, uint16_t port) { struct syncache *sc; struct syncache_head *sch; if (syncache_cookiesonly()) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the port != sc_port, then it's a bogus ICMP msg */ if (port != sc->sc_port) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. * * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; NET_EPOCH_ASSERT(); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. */ if ((so = solisten_clone(lso)) == NULL) goto allocfail; #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif error = in_pcballoc(so, &V_tcbinfo); if (error) { sodealloc(so); goto allocfail; } inp = sotoinpcb(so); if ((tp = tcp_newtcpcb(inp)) == NULL) { in_pcbdetach(inp); in_pcbfree(inp); sodealloc(so); goto allocfail; } inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_ip_ttl = sc->sc_ip_ttl; inp->inp_ip_tos = sc->sc_ip_tos; inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); #ifdef NUMA inp->inp_numa_domain = m->m_pkthdr.numa_domain; #endif } inp->inp_lport = sc->sc_inc.inc_lport; #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { struct inpcb *oinp = sotoinpcb(lso); /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); inp->in6p_hops = oinp->in6p_hops; } if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct in6_addr laddr6; struct sockaddr_in6 sin6; sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; INP_HASH_WLOCK(&V_tcbinfo); - error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, - thread0.td_ucred, m, false); + error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, + thread0.td_ucred, false); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) { inp->in6p_laddr = laddr6; goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, thread0.td_ucred, false); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) { inp->inp_laddr = laddr; goto abort; } } #endif /* INET */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Copy old policy into new socket's. */ if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0) printf("syncache_socket: could not copy policy\n"); #endif tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tp->t_port = sc->sc_port; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) { /* * Our parents t_fb was not the default, * we need to release our ref on tp->t_fb and * pickup one on the new entry. */ struct tcp_function_block *rblk; rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = rblk; /* * XXXrrs this is quite dangerous, it is possible * for the new function to fail to init. We also * are not asking if the handoff_is_ok though at * the very start thats probalbly ok. */ if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } tcp_ecn_syncache_socket(tp, sc); /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime; tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); TCPSTAT_INC(tcps_accepts); TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, TCPS_LISTEN); if (!solisten_enqueue(so, SS_ISCONNECTED)) tp->t_flags |= TF_SONOTCONN; return (so); allocfail: /* * Drop the connection; we will either send a RST or have the peer * retransmit its SYN again after its RTO and try again. */ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_listendrop); return (NULL); abort: in_pcbdetach(inp); in_pcbfree(inp); sodealloc(so); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in%s_pcbconnect failed with error %i\n", s, __func__, (sc->sc_inc.inc_flags & INC_ISIPV6) ? "6" : "", error); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_listendrop); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. * * On syncache_socket() success the newly created socket * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m, uint16_t port) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; bool locked; NET_EPOCH_ASSERT(); KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); if (syncache_cookiesonly()) { sc = NULL; sch = syncache_hashbucket(inc); locked = false; } else { sc = syncache_lookup(inc, &sch); /* returns locked sch */ locked = true; SCH_LOCK_ASSERT(sch); } #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop, port); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. Check if syncookies are used in case of syncache * overflows * B. See if this socket has had a syncache entry dropped in * the recent past. We don't want to accept a bogus * syncookie if we've never received a SYN or accept it * twice. * C. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (locked && !V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } if (locked && !V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (no syncache entry)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port); if (locked) SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. */ if ((to->to_flags & TOF_SIGNATURE) != 0 && (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0)) { /* Drop the ACK. */ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment rejected, " "MD5 signature doesn't match.\n", s, __func__); free(s, M_TCPLOG); } TCPSTAT_INC(tcps_sig_err_sigopt); return (-1); /* Do not send RST */ } #endif /* TCP_SIGNATURE */ TCPSTATES_INC(TCPS_SYN_RECEIVED); } else { if (sc->sc_port != port) { SCH_UNLOCK(sch); return (0); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, check that * received ACK has signature and it is correct. * If not, drop the ACK and leave sc entry in th cache, * because SYN was received with correct signature. */ if (sc->sc_flags & SCF_SIGNATURE) { if ((to->to_flags & TOF_SIGNATURE) == 0) { /* No signature */ TCPSTAT_INC(tcps_sig_err_nosigopt); SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature wasn't " "provided.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) { /* Doesn't match or no SA */ SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment " "rejected, MD5 signature doesn't " "match.\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } } #endif /* TCP_SIGNATURE */ /* * RFC 7323 PAWS: If we have a timestamp on this segment and * it's less than ts_recent, drop it. * XXXMT: RFC 7323 also requires to send an ACK. * In tcp_input.c this is only done for TCP segments * with user data, so be consistent here and just drop * the segment. */ if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS && TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: SEG.TSval %u < TS.Recent %u, " "segment dropped\n", s, __func__, to->to_tsval, sc->sc_tsreflect); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not " "expected, segment processed normally\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless the missing timestamps are tolerated. * See section 3.2 of RFC 7323. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if (V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } return (-1); /* Do not send RST */ } } TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (__predict_false(*lsop == NULL)) { TCPSTAT_INC(tcps_sc_aborted); TCPSTATES_DEC(TCPS_SYN_RECEIVED); } else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL) { TCPSTATES_DEC(TCPS_SYN_RECEIVED); if (sc != &scs) syncache_free(sc); } if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } static struct socket * syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m, uint64_t response_cookie) { struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; struct socket *so; NET_EPOCH_ASSERT(); pending_counter = intotcpcb(sotoinpcb(lso))->t_tfo_pending; so = syncache_socket(sc, lso, m); if (so == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { soisconnected(so); inp = sotoinpcb(so); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie.server = response_cookie; tp->snd_max = tp->iss; tp->snd_nxt = tp->iss; tp->t_tfo_pending = pending_counter; TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_completed); } return (so); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. * * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) * cookie is processed and a new socket is created. In this case, any data * accompanying the SYN will be queued to the socket by tcp_input() and will * be ACKed either when the application sends response data or the delayed * ACK timer expires, whichever comes first. */ struct socket * syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod, void *todctx, uint8_t iptos, uint16_t port) { struct tcpcb *tp; struct socket *rv = NULL; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, ip_ttl, ip_tos; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; uint64_t tfo_response_cookie; unsigned int *tfo_pending = NULL; int tfo_cookie_valid = 0; int tfo_response_cookie_valid = 0; bool locked; INP_RLOCK_ASSERT(inp); /* listen socket */ KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { if (inp->inp_flags & IN6P_AUTOFLOWLABEL) { autoflowlabel = 1; } ip_ttl = in6_selecthlim(inp, NULL); if ((inp->in6p_outputopts == NULL) || (inp->in6p_outputopts->ip6po_tclass == -1)) { ip_tos = 0; } else { ip_tos = inp->in6p_outputopts->ip6po_tclass; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; } #endif win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); if (V_tcp_fastopen_server_enable && IS_FASTOPEN(tp->t_flags) && (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { /* * Limit the number of pending TFO connections to * approximately half of the queue limit. This prevents TFO * SYN floods from starving the service by filling the * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, to->to_tfo_cookie, to->to_tfo_len, &tfo_response_cookie); tfo_cookie_valid = (result > 0); tfo_response_cookie_valid = (result >= 0); } /* * Remember the TFO pending counter as it will have to be * decremented below if we don't make it to syncache_tfo_expand(). */ tfo_pending = tp->t_tfo_pending; } #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_RUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif if (!tfo_cookie_valid) INP_RUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * When the socket is TCP-MD5 enabled check that, * - a signed packet is valid * - a non-signed packet does not have a security association * * If a signed packet fails validation or a non-signed packet has a * security association, the packet will be dropped. */ if (ltflags & TF_SIGNATURE) { if (to->to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to->to_signature) != 0) goto done; } else { if (TCPMD5_ENABLED() && TCPMD5_INPUT(m, NULL, NULL) != ENOENT) goto done; } } else if (to->to_flags & TOF_SIGNATURE) goto done; #endif /* TCP_SIGNATURE */ /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ if (syncache_cookiesonly()) { sc = NULL; sch = syncache_hashbucket(inc); locked = false; } else { sc = syncache_lookup(inc, &sch); /* returns locked sch */ locked = true; SCH_LOCK_ASSERT(sch); } if (sc != NULL) { if (tfo_cookie_valid) INP_RUNLOCK(inp); TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; /* * Adjust ECN response if needed, e.g. different * IP ECN field, or a fallback by the remote host. */ if (sc->sc_flags & SCF_ECN_MASK) { sc->sc_flags &= ~SCF_ECN_MASK; sc->sc_flags = tcp_ecn_syncache_add(tcp_get_flags(th), iptos); } #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto donenoprobe; } if (tfo_cookie_valid) { bzero(&scs, sizeof(scs)); sc = &scs; goto skip_alloc; } /* * Skip allocating a syncache entry if we are just going to discard * it later. */ if (!locked) { bzero(&scs, sizeof(scs)); sc = &scs; } else sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) { sch->sch_last_overflow = time_uptime; syncache_drop(sc, sch); syncache_pause(inc); } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { KASSERT(locked, ("%s: bucket unexpectedly unlocked", __func__)); SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } skip_alloc: if (!tfo_cookie_valid && tfo_response_cookie_valid) sc->sc_tfo_cookie = &tfo_response_cookie; /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; sc->sc_port = port; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323 && !(ltflags & TF_NOOPT)) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if ((to->to_flags & TOF_TS) && (V_tcp_do_rfc1323 != 2)) { sc->sc_tsreflect = to->to_tsval; sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsoff = tcp_new_ts_offset(inc); } if ((to->to_flags & TOF_SCALE) && (V_tcp_do_rfc1323 != 3)) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If incoming packet has an MD5 signature, flag this in the * syncache so that syncache_respond() will do the right thing * with the SYN+ACK. */ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif /* TCP_SIGNATURE */ if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; /* ECN Handshake */ if (V_tcp_do_ecn) sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos); if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); else sc->sc_iss = arc4random(); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif if (locked) SCH_UNLOCK(sch); if (tfo_cookie_valid) { rv = syncache_tfo_expand(sc, so, m, tfo_response_cookie); /* INP_RUNLOCK(inp) will be performed by the caller */ goto tfo_expanded; } TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } goto donenoprobe; done: TCP_PROBE5(receive, NULL, NULL, m, NULL, th); donenoprobe: if (m) m_freem(m); /* * If tfo_pending is not NULL here, then a TFO SYN that did not * result in a new socket was processed and the associated pending * counter has not yet been decremented. All such TFO processing paths * transit this point. */ if (tfo_pending != NULL) tcp_fastopen_decrement_counter(tfo_pending); tfo_expanded: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif return (rv); } /* * Send SYN|ACK or ACK to the peer. Either in response to a peer's segment, * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; struct udphdr *udp = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt, ulen; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif NET_EPOCH_ASSERT(); hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); if (sc->sc_port) { tlen += sizeof(struct udphdr); } /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_port) mssopt -= V_tcp_udp_tunneling_overhead; mssopt = max(mssopt, V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small: hlen %u, sc_port %u, max_linkhdr %d + " "tlen %d + TCP_MAXOLEN %ju <= MHLEN %d", hlen, sc->sc_port, max_linkhdr, tlen, (uintmax_t)TCP_MAXOLEN, MHLEN)); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ /* Zero out traffic class and flow label. */ ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; ip6->ip6_flow |= sc->sc_flowlabel; if (sc->sc_port != 0) { ip6->ip6_nxt = IPPROTO_UDP; udp = (struct udphdr *)(ip6 + 1); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = sc->sc_port; ulen = (tlen - sizeof(struct ip6_hdr)); th = (struct tcphdr *)(udp + 1); } else { ip6->ip6_nxt = IPPROTO_TCP; th = (struct tcphdr *)(ip6 + 1); } ip6->ip6_flow |= htonl(sc->sc_ip_tos << 20); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); if (sc->sc_port == 0) { ip->ip_p = IPPROTO_TCP; th = (struct tcphdr *)(ip + 1); } else { ip->ip_p = IPPROTO_UDP; udp = (struct udphdr *)(ip + 1); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = sc->sc_port; ulen = (tlen - sizeof(struct ip)); th = (struct tcphdr *)(udp + 1); } } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; if (flags & TH_SYN) th->th_seq = htonl(sc->sc_iss); else th->th_seq = htonl(sc->sc_iss + 1); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; flags = tcp_ecn_syncache_respond(flags, sc); tcp_set_flags(th, flags); /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; if (flags & TH_SYN) { to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif if (sc->sc_tfo_cookie) { to.to_flags |= TOF_FASTOPEN; to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = sc->sc_tfo_cookie; /* don't send cookie again when retransmitting response */ sc->sc_tfo_cookie = NULL; } } if (sc->sc_flags & SCF_TIMESTAMP) { to.to_tsval = sc->sc_tsoff + tcp_ts_getticks(); to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) { KASSERT(to.to_flags & TOF_SIGNATURE, ("tcp_addoptions() didn't set tcp_signature")); /* NOTE: to.to_signature is inside of mbuf */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, to.to_signature) != 0) { m_freem(m); return (EACCES); } } #endif } else optlen = 0; if (udp) { ulen += optlen; udp->uh_ulen = htons(ulen); } M_SETFIB(m, sc->sc_inc.inc_fibnum); /* * If we have peer's SYN and it has a flowid, then let's assign it to * our SYN|ACK. ip6_output() and ip_output() will not assign flowid * to SYN|ACK due to lack of inp here. */ if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = m0->m_pkthdr.flowid; M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { if (sc->sc_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); } ip6->ip6_hlim = sc->sc_ip_ttl; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (sc->sc_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); } #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif TCP_PROBE5(send, NULL, NULL, ip, NULL, th); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our initial sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &irs, sizeof(irs)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > sc->sc_peer_mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = nitems(tcp_sc_wstab) - 1; tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = V_tcp_syncache.secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = V_tcp_syncache.secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = V_tcp_syncache.secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = htonl(sc->sc_iss) & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_tsoff = tcp_new_ts_offset(inc); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; sc->sc_port = port; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * We have overflowed a bucket. Let's pause dealing with the syncache. * This function will increment the bucketoverflow statistics appropriately * (once per pause when pausing is enabled; otherwise, once per overflow). */ static void syncache_pause(struct in_conninfo *inc) { time_t delta; const char *s; /* XXX: * 2. Add sysctl read here so we don't get the benefit of this * change without the new sysctl. */ /* * Try an unlocked read. If we already know that another thread * has activated the feature, there is no need to proceed. */ if (V_tcp_syncache.paused) return; /* Are cookied enabled? If not, we can't pause. */ if (!V_tcp_syncookies) { TCPSTAT_INC(tcps_sc_bucketoverflow); return; } /* * We may be the first thread to find an overflow. Get the lock * and evaluate if we need to take action. */ mtx_lock(&V_tcp_syncache.pause_mtx); if (V_tcp_syncache.paused) { mtx_unlock(&V_tcp_syncache.pause_mtx); return; } /* Activate protection. */ V_tcp_syncache.paused = true; TCPSTAT_INC(tcps_sc_bucketoverflow); /* * Determine the last backoff time. If we are seeing a re-newed * attack within that same time after last reactivating the syncache, * consider it an extension of the same attack. */ delta = TCP_SYNCACHE_PAUSE_TIME << V_tcp_syncache.pause_backoff; if (V_tcp_syncache.pause_until + delta - time_uptime > 0) { if (V_tcp_syncache.pause_backoff < TCP_SYNCACHE_MAX_BACKOFF) { delta <<= 1; V_tcp_syncache.pause_backoff++; } } else { delta = TCP_SYNCACHE_PAUSE_TIME; V_tcp_syncache.pause_backoff = 0; } /* Log a warning, including IP addresses, if able. */ if (inc != NULL) s = tcp_log_addrs(inc, NULL, NULL, NULL); else s = (const char *)NULL; log(LOG_WARNING, "TCP syncache overflow detected; using syncookies for " "the next %lld seconds%s%s%s\n", (long long)delta, (s != NULL) ? " (last SYN: " : "", (s != NULL) ? s : "", (s != NULL) ? ")" : ""); free(__DECONST(void *, s), M_TCPLOG); /* Use the calculated delta to set a new pause time. */ V_tcp_syncache.pause_until = time_uptime + delta; callout_reset(&V_tcp_syncache.pause_co, delta * hz, syncache_unpause, &V_tcp_syncache); mtx_unlock(&V_tcp_syncache.pause_mtx); } /* Evaluate whether we need to unpause. */ static void syncache_unpause(void *arg) { struct tcp_syncache *sc; time_t delta; sc = arg; mtx_assert(&sc->pause_mtx, MA_OWNED | MA_NOTRECURSED); callout_deactivate(&sc->pause_co); /* * Check to make sure we are not running early. If the pause * time has expired, then deactivate the protection. */ if ((delta = sc->pause_until - time_uptime) > 0) callout_schedule(&sc->pause_co, delta * hz); else sc->paused = false; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int error, i; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); xt.t_state = TCPS_SYN_RECEIVED; xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; xt.xt_inp.xi_socket.xso_len = sizeof (struct xsocket); xt.xt_inp.xi_socket.so_type = SOCK_STREAM; xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING; for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (sc->sc_cred != NULL && cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; xt.xt_encaps_port = sc->sc_port; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); return (0); } } SCH_UNLOCK(sch); } return (0); } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 8cc5894171de..5d82e5138538 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,3098 +1,3098 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include #include /* * TCP protocol interface to socket abstraction. */ #ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); static int tcp_pru_options_support(struct tcpcb *tp, int flags); /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. */ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) goto out; so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; error = in_pcballoc(so, &V_tcbinfo); if (error) goto out; inp = sotoinpcb(so); tp = tcp_newtcpcb(inp); if (tp == NULL) { error = ENOBUFS; in_pcbdetach(inp); in_pcbfree(inp); goto out; } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); out: TCP_PROBE2(debug__user, tp, PRU_ATTACH); return (error); } /* * tcp_usr_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); KASSERT(so->so_pcb == inp && inp->inp_socket == so, ("%s: socket %p inp %p mismatch", __func__, so, inp)); tp = intotcpcb(inp); KASSERT(inp->inp_flags & INP_DROPPED || tp->t_state < TCPS_SYN_SENT, ("%s: inp %p not dropped or embryonic", __func__, inp)); tcp_discardcb(tp); in_pcbdetach(inp); in_pcbfree(inp); } #ifdef INET /* * Give the socket an address. */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; #ifdef KDTRACE_HOOKS struct tcpcb *tp = NULL; #endif struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) { /* * Preserve compatibility with old programs. */ if (nam->sa_family != AF_UNSPEC || nam->sa_len < offsetof(struct sockaddr_in, sin_zero) || sinp->sin_addr.s_addr != INADDR_ANY) return (EAFNOSUPPORT); nam->sa_family = AF_INET; } if (nam->sa_len != sizeof(*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = EINVAL; goto out; } #ifdef KDTRACE_HOOKS tp = intotcpcb(inp); #endif INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; #ifdef KDTRACE_HOOKS struct tcpcb *tp = NULL; #endif struct sockaddr_in6 *sin6; u_char vflagsav; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; if (inp->inp_flags & INP_DROPPED) { error = EINVAL; goto out; } #ifdef KDTRACE_HOOKS tp = intotcpcb(inp); #endif INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: if (error != 0) inp->inp_vflag = vflagsav; TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Prepare to accept connections. */ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = EINVAL; goto out; } tp = intotcpcb(inp); SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); goto out; } if (inp->inp_lport == 0) { INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, NULL, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); } if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } else { solisten_proto_abort(so); } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; u_char vflagsav; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = EINVAL; goto out; } vflagsav = inp->inp_vflag; tp = intotcpcb(inp); SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); goto out; } INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, NULL, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } else { solisten_proto_abort(so); } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); if (error != 0) inp->inp_vflag = vflagsav; out: TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) return (EAFNOSUPPORT); if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) return (EACCES); if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } if (SOLISTENING(so)) { error = EOPNOTSUPP; goto out; } tp = intotcpcb(inp); NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, nam, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tcp_output(tp); KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); out_in_epoch: NET_EPOCH_EXIT(et); out: TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6; u_int8_t incflagsav; u_char vflagsav; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } if (SOLISTENING(so)) { error = EINVAL; goto out; } tp = intotcpcb(inp); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) { error = EACCES; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif error = tcp_output(tp); goto out_in_epoch; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0) goto out; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; NET_EPOCH_ENTER(et); if ((error = tcp6_connect(tp, nam, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tcp_output(tp); out_in_epoch: NET_EPOCH_EXIT(et); out: KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); /* * If the implicit bind in the connect call fails, restore * the flags we modified. */ if (error != 0 && inp->inp_lport == 0) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; int error = 0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); tcp_disconnect(tp); out: TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); return (error); } #ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; #ifdef KDTRACE_HOOKS struct tcpcb *tp = NULL; #endif struct in_addr addr; in_port_t port = 0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = ECONNABORTED; goto out; } #ifdef KDTRACE_HOOKS tp = intotcpcb(inp); #endif /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; out: TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #endif /* INET */ #ifdef INET6 static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; #ifdef KDTRACE_HOOKS struct tcpcb *tp = NULL; #endif struct in_addr addr; struct in6_addr addr6; struct epoch_tracker et; in_port_t port = 0; int v4 = 0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = ECONNABORTED; goto out; } #ifdef KDTRACE_HOOKS tp = intotcpcb(inp); #endif /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); NET_EPOCH_ENTER(et); socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) error = tcp_output_nodrop(tp); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } /* * After a receive, possibly send window update to peer. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp = NULL; int outrv = 0, error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); NET_EPOCH_ENTER(et); /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early * SYN|ACK. It is preferable to have the SYN|ACK be sent along with * application response data, or failing that, when the DELACK timer * expires. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif outrv = tcp_output_nodrop(tp); out: TCP_PROBE2(debug__user, tp, PRU_RCVD); (void) tcp_unlock_or_drop(tp, outrv); NET_EPOCH_EXIT(et); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; #ifdef INET #ifdef INET6 struct sockaddr_in sin; #endif struct sockaddr_in *sinp; #endif #ifdef INET6 int isipv6; #endif u_int8_t incflagsav; u_char vflagsav; bool restoreflags; if (control != NULL) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); return (EINVAL); } m_freem(control); /* empty control, just free it */ } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); INP_WUNLOCK(inp); return (ECONNRESET); } vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; restoreflags = false; tp = intotcpcb(inp); NET_EPOCH_ENTER(et); if ((flags & PRUS_OOB) != 0 && (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0) goto out; if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { if (tp->t_state == TCPS_LISTEN) { error = EINVAL; goto out; } switch (nam->sa_family) { #ifdef INET case AF_INET: sinp = (struct sockaddr_in *)nam; if (sinp->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV6) != 0) { error = EAFNOSUPPORT; goto out; } if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) { error = EACCES; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) goto out; #ifdef INET6 isipv6 = 0; #endif break; #endif /* INET */ #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_len != sizeof(*sin6)) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV6PROTO) == 0) { error = EAFNOSUPPORT; goto out; } if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { error = EAFNOSUPPORT; goto out; } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV6; sinp = &sin; in6_sin6_2_sin(sinp, sin6); if (IN_MULTICAST( ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) goto out; isipv6 = 0; #else /* !INET */ error = EAFNOSUPPORT; goto out; #endif /* INET */ } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV4; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr))) goto out; isipv6 = 1; } break; } #endif /* INET6 */ default: error = EAFNOSUPPORT; goto out; } } if (!(flags & PRUS_OOB)) { if (tp->t_acktime == 0) tp->t_acktime = ticks; sbappendstream(&so->so_snd, m, flags); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { KASSERT(tp->t_state == TCPS_CLOSED, ("%s: tp %p is listening", __func__, tp)); /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error) { /* m is freed if PRUS_NOTREADY is unset. */ sbflush(&so->so_snd); goto out; } if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_connect(tp); else { tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); tcp_usrclosed(tp); } if (TCPS_HAVEESTABLISHED(tp->t_state) && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_out == 0) && (so->so_snd.sb_ccc > 0)) { tp->t_fbyte_out = ticks; if (tp->t_fbyte_out == 0) tp->t_fbyte_out = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } if (!(inp->inp_flags & INP_DROPPED) && !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tcp_output_nodrop(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ if (tp->t_acktime == 0) tp->t_acktime = ticks; sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ /* * Not going to contemplate SYN|URG */ if (IS_FASTOPEN(tp->t_flags)) tp->t_flags &= ~TF_FASTOPEN; #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error != 0) { /* m is freed if PRUS_NOTREADY is unset. */ sbflush(&so->so_snd); goto out; } tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if ((flags & PRUS_NOTREADY) == 0) { tp->t_flags |= TF_FORCEDATA; error = tcp_output_nodrop(tp); tp->t_flags &= ~TF_FORCEDATA; } } TCP_LOG_EVENT(tp, NULL, &inp->inp_socket->so_rcv, &inp->inp_socket->so_snd, TCP_LOG_USERSEND, error, 0, NULL, false); out: /* * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is * responsible for freeing memory. */ if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); /* * If the request was unsuccessful and we changed flags, * restore the original flags. */ if (error != 0 && restoreflags) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } static int tcp_usr_ready(struct socket *so, struct mbuf *m, int count) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp; int error; inp = sotoinpcb(so); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); mb_free_notready(m, count); return (ECONNRESET); } tp = intotcpcb(inp); SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error) { INP_WUNLOCK(inp); return (error); } NET_EPOCH_ENTER(et); error = tcp_output_unlock(tp); NET_EPOCH_EXIT(et); return (error); } /* * Abort the TCP. Drop the connection abruptly. */ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp == NULL) goto dropped; TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { soref(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); dropped: NET_EPOCH_EXIT(et); } /* * TCP socket is closed. Start friendly disconnect. */ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); tp->t_flags |= TF_CLOSED; tcp_disconnect(tp); TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { soref(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); } static int tcp_pru_options_support(struct tcpcb *tp, int flags) { /* * If the specific TCP stack has a pru_options * specified then it does not always support * all the PRU_XX options and we must ask it. * If the function is not specified then all * of the PRU_XX options are supported. */ int ret = 0; if (tp->t_fb->tfb_pru_options) { ret = (*tp->t_fb->tfb_pru_options)(tp, flags); } return (ret); } /* * Receive out-of-band data. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); error = tcp_pru_options_support(tp, PRUS_OOB); if (error) { goto out; } if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET struct protosw tcp_protosw = { .pr_type = SOCK_STREAM, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD | PR_CAPATTACH, .pr_ctloutput = tcp_ctloutput, .pr_abort = tcp_usr_abort, .pr_accept = tcp_usr_accept, .pr_attach = tcp_usr_attach, .pr_bind = tcp_usr_bind, .pr_connect = tcp_usr_connect, .pr_control = in_control, .pr_detach = tcp_usr_detach, .pr_disconnect = tcp_usr_disconnect, .pr_listen = tcp_usr_listen, .pr_peeraddr = in_getpeeraddr, .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in_getsockaddr, .pr_sosetlabel = in_pcbsosetlabel, .pr_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 struct protosw tcp6_protosw = { .pr_type = SOCK_STREAM, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED | PR_IMPLOPCL |PR_WANTRCVD | PR_CAPATTACH, .pr_ctloutput = tcp_ctloutput, .pr_abort = tcp_usr_abort, .pr_accept = tcp6_usr_accept, .pr_attach = tcp_usr_attach, .pr_bind = tcp6_usr_bind, .pr_connect = tcp6_usr_connect, .pr_control = in6_control, .pr_detach = tcp_usr_detach, .pr_disconnect = tcp_usr_disconnect, .pr_listen = tcp6_usr_listen, .pr_peeraddr = in6_mapped_peeraddr, .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in6_mapped_sockaddr, .pr_sosetlabel = in_pcbsosetlabel, .pr_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state. */ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tptoinpcb(tp), *oinp; struct socket *so = tptosocket(tp); struct in_addr laddr; u_short lport; int error; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) goto out; if (oinp) { error = EADDRINUSE; goto out; } /* Handle initial bind if it hadn't been done in advance. */ if (inp->inp_lport == 0) { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto out; } } inp->inp_laddr = laddr; in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tptoinpcb(tp); int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); - error = in6_pcbconnect(inp, nam, td->td_ucred); + error = in6_pcbconnect(inp, nam, td->td_ucred, true); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) return (error); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return (0); } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { INP_WLOCK_ASSERT(tptoinpcb(tp)); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ti->tcpi_options |= TCPI_OPT_ECN; ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; tcp_offload_tcp_info(tp, ti); } #endif /* * AccECN related counters. */ if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) == (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) /* * Internal counter starts at 5 for AccECN * but 0 for RFC3168 ECN. */ ti->tcpi_delivered_ce = tp->t_scep - 5; else ti->tcpi_delivered_ce = tp->t_scep; ti->tcpi_received_ce = tp->t_rcep; } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & INP_DROPPED) { \ INP_WUNLOCK(inp); \ cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) { struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); int error = 0; MPASS(sopt->sopt_dir == SOPT_SET); INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("inp_flags == %x", inp->inp_flags)); KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) error = ip6_ctloutput(so, sopt); #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET error = ip_ctloutput(so, sopt); #endif /* * When an IP-level socket option affects TCP, pass control * down to stack tfb_tcp_ctloutput, otherwise return what * IP level returned. */ switch (sopt->sopt_level) { #ifdef INET6 case IPPROTO_IPV6: if ((inp->inp_vflag & INP_IPV6PROTO) == 0) return (error); switch (sopt->sopt_name) { case IPV6_TCLASS: /* Notify tcp stacks that care (e.g. RACK). */ break; case IPV6_USE_MIN_MTU: /* Update t_maxseg accordingly. */ break; default: return (error); } break; #endif #ifdef INET case IPPROTO_IP: switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos &= ~IPTOS_ECN_MASK; break; case IP_TTL: /* Notify tcp stacks that care (e.g. RACK). */ break; default: return (error); } break; #endif default: return (error); } INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } } else if (sopt->sopt_name == TCP_FUNCTION_BLK) { /* * Protect the TCP option TCP_FUNCTION_BLK so * that a sub-function can *never* overwrite this. */ struct tcp_function_set fsn; struct tcp_function_block *blk; INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { INP_WUNLOCK(inp); return (ENOENT); } if (tp->t_fb == blk) { /* You already have this */ refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (0); } if (tp->t_state != TCPS_CLOSED) { /* * The user has advanced the state * past the initial point, we may not * be able to switch. */ if (blk->tfb_tcp_handoff_ok != NULL) { /* * Does the stack provide a * query mechanism, if so it may * still be possible? */ error = (*blk->tfb_tcp_handoff_ok)(tp); } else error = EINVAL; if (error) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return(error); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } /* * Release the old refcnt, the * lookup acquired a ref on the * new one already. */ if (tp->t_fb->tfb_tcp_fb_fini) { struct epoch_tracker et; /* * Tell the stack to cleanup with 0 i.e. * the tcb is not going away. */ NET_EPOCH_ENTER(et); (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); NET_EPOCH_EXIT(et); } #ifdef TCPHPTS /* Assure that we are not on any hpts */ tcp_hpts_remove(tptoinpcb(tp)); #endif if (blk->tfb_tcp_fb_init) { error = (*blk->tfb_tcp_fb_init)(tp); if (error) { refcount_release(&blk->tfb_refcnt); if (tp->t_fb->tfb_tcp_fb_init) { if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { /* Fall back failed, drop the connection */ INP_WUNLOCK(inp); soabort(so); return (error); } } goto err_out; } } refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif err_out: INP_WUNLOCK(inp); return (error); } /* Pass in the INP locked, callee must unlock it. */ return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt)); } static int tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt) { struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); int error = 0; MPASS(sopt->sopt_dir == SOPT_GET); INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("inp_flags == %x", inp->inp_flags)); KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) error = ip6_ctloutput(so, sopt); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = ip_ctloutput(so, sopt); #endif return (error); } if (((sopt->sopt_name == TCP_FUNCTION_BLK) || (sopt->sopt_name == TCP_FUNCTION_ALIAS))) { struct tcp_function_set fsn; if (sopt->sopt_name == TCP_FUNCTION_ALIAS) { memset(&fsn, 0, sizeof(fsn)); find_tcp_function_alias(tp->t_fb, &fsn); } else { strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); return (error); } /* Pass in the INP locked, callee must unlock it. */ return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt)); } int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } if (sopt->sopt_dir == SOPT_SET) return (tcp_ctloutput_set(inp, sopt)); else if (sopt->sopt_dir == SOPT_GET) return (tcp_ctloutput_get(inp, sopt)); else panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir); } /* * If this assert becomes untrue, we need to change the size of the buf * variable in tcp_default_ctloutput(). */ #ifdef CTASSERT CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN); CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif #ifdef KERN_TLS static int copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) { struct tls_enable_v0 tls_v0; int error; if (sopt->sopt_valsize == sizeof(tls_v0)) { error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), sizeof(tls_v0)); if (error) return (error); memset(tls, 0, sizeof(*tls)); tls->cipher_key = tls_v0.cipher_key; tls->iv = tls_v0.iv; tls->auth_key = tls_v0.auth_key; tls->cipher_algorithm = tls_v0.cipher_algorithm; tls->cipher_key_len = tls_v0.cipher_key_len; tls->iv_len = tls_v0.iv_len; tls->auth_algorithm = tls_v0.auth_algorithm; tls->auth_key_len = tls_v0.auth_key_len; tls->flags = tls_v0.flags; tls->tls_vmajor = tls_v0.tls_vmajor; tls->tls_vminor = tls_v0.tls_vminor; return (0); } return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls))); } #endif extern struct cc_algo newreno_cc_algo; static int tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt) { struct cc_algo *algo; void *ptr = NULL; struct tcpcb *tp; struct cc_var cc_mem; char buf[TCP_CA_NAME_MAX]; size_t mem_sz; int error; INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) return(error); buf[sopt->sopt_valsize] = '\0'; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) { if (algo->flags & CC_MODULE_BEING_REMOVED) { /* We can't "see" modules being unloaded */ continue; } break; } } if (algo == NULL) { CC_LIST_RUNLOCK(); return(ESRCH); } /* * With a reference the algorithm cannot be removed * so we hold a reference through the change process. */ cc_refer(algo); CC_LIST_RUNLOCK(); if (algo->cb_init != NULL) { /* We can now pre-get the memory for the CC */ mem_sz = (*algo->cc_data_sz)(); if (mem_sz == 0) { goto no_mem_needed; } ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK); } else { no_mem_needed: mem_sz = 0; ptr = NULL; } /* * Make sure its all clean and zero and also get * back the inplock. */ memset(&cc_mem, 0, sizeof(cc_mem)); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); if (ptr) free(ptr, M_CC_MEM); /* Release our temp reference */ CC_LIST_RLOCK(); cc_release(algo); CC_LIST_RUNLOCK(); return (ECONNRESET); } tp = intotcpcb(inp); if (ptr != NULL) memset(ptr, 0, mem_sz); cc_mem.ccvc.tcp = tp; /* * We once again hold a write lock over the tcb so it's * safe to do these things without ordering concerns. * Note here we init into stack memory. */ if (algo->cb_init != NULL) error = algo->cb_init(&cc_mem, ptr); else error = 0; /* * The CC algorithms, when given their memory * should not fail we could in theory have a * KASSERT here. */ if (error == 0) { /* * Touchdown, lets go ahead and move the * connection to the new CC module by * copying in the cc_mem after we call * the old ones cleanup (if any). */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); /* Detach the old CC from the tcpcb */ cc_detach(tp); /* Copy in our temp memory that was inited */ memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var)); /* Now attach the new, which takes a reference */ cc_attach(tp, algo); /* Ok now are we where we have gotten past any conn_init? */ if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) { /* Yep run the connection init for the new CC */ CC_ALGO(tp)->conn_init(&tp->t_ccv); } } else if (ptr) free(ptr, M_CC_MEM); INP_WUNLOCK(inp); /* Now lets release our temp reference */ CC_LIST_RLOCK(); cc_release(algo); CC_LIST_RUNLOCK(); return (error); } int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt) { struct tcpcb *tp = intotcpcb(inp); int error, opt, optval; u_int ui; struct tcp_info ti; #ifdef KERN_TLS struct tls_enable tls; struct socket *so = inp->inp_socket; #endif char *pbuf, buf[TCP_LOG_ID_LEN]; #ifdef STATS struct statsblob *sbp; #endif size_t len; INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("inp_flags == %x", inp->inp_flags)); KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL")); switch (sopt->sopt_level) { #ifdef INET6 case IPPROTO_IPV6: MPASS(inp->inp_vflag & INP_IPV6PROTO); switch (sopt->sopt_name) { case IPV6_USE_MIN_MTU: tcp6_use_min_mtu(tp); /* FALLTHROUGH */ } INP_WUNLOCK(inp); return (0); #endif #ifdef INET case IPPROTO_IP: INP_WUNLOCK(inp); return (0); #endif } /* * For TCP_CCALGOOPT forward the control to CC module, for both * SOPT_SET and SOPT_GET. */ switch (sopt->sopt_name) { case TCP_CCALGOOPT: INP_WUNLOCK(inp); if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT) return (EINVAL); pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, sopt->sopt_valsize); if (error) { free(pbuf, M_TEMP); return (error); } INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); if (CC_ALGO(tp)->ctl_output != NULL) error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf); else error = ENOENT; INP_WUNLOCK(inp); if (error == 0 && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); free(pbuf, M_TEMP); return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: INP_WUNLOCK(inp); if (!TCPMD5_ENABLED()) return (ENOPROTOOPT); error = TCPMD5_PCBCTL(inp, sopt); if (error) return (error); INP_WLOCK_RECHECK(inp); goto unlock_and_done; #endif /* IPSEC */ case TCP_NODELAY: case TCP_NOOPT: case TCP_LRD: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; case TCP_LRD: opt = TF_LRD; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; unlock_and_done: #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); break; case TCP_NOPUSH: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = tcp_output_nodrop(tp); NET_EPOCH_EXIT(et); } } goto unlock_and_done; case TCP_REMOTE_UDP_ENCAPS_PORT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); if ((optval < TCP_TUNNELING_PORT_MIN) || (optval > TCP_TUNNELING_PORT_MAX)) { /* Its got to be in range */ return (EINVAL); } if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) { /* You have to have enabled a UDP tunneling port first */ return (EINVAL); } INP_WLOCK_RECHECK(inp); if (tp->t_state != TCPS_CLOSED) { /* You can't change after you are connected */ error = EINVAL; } else { /* Ok we are all good set the port */ tp->t_port = htons(optval); } goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_STATS: INP_WUNLOCK(inp); #ifdef STATS error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); if (optval > 0) sbp = stats_blob_alloc( V_tcp_perconn_stats_dflt_tpl, 0); else sbp = NULL; INP_WLOCK_RECHECK(inp); if ((tp->t_stats != NULL && sbp == NULL) || (tp->t_stats == NULL && sbp != NULL)) { struct statsblob *t = tp->t_stats; tp->t_stats = sbp; sbp = t; } INP_WUNLOCK(inp); stats_blob_destroy(sbp); #else return (EOPNOTSUPP); #endif /* !STATS */ break; case TCP_CONGESTION: error = tcp_set_cc_mod(inp, sopt); break; case TCP_REUSPORT_LB_NUMA: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); INP_WLOCK_RECHECK(inp); if (!error) error = in_pcblbgroup_numa(inp, optval); INP_WUNLOCK(inp); break; #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); error = copyin_tls_enable(sopt, &tls); if (error) break; error = ktls_enable_tx(so, &tls); break; case TCP_TXTLS_MODE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); error = ktls_set_tx_mode(so, ui); INP_WUNLOCK(inp); break; case TCP_RXTLS_ENABLE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &tls, sizeof(tls), sizeof(tls)); if (error) break; error = ktls_enable_rx(so, &tls); break; #endif case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); if (ui > (UINT_MAX / hz)) { error = EINVAL; break; } ui *= hz; INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_MAXUNACKTIME: tp->t_maxunacktime = ui; break; case TCP_KEEPIDLE: tp->t_keepidle = ui; /* * XXX: better check current remaining * timeout and "merge" it with new value. */ if ((tp->t_state > TCPS_LISTEN) && (tp->t_state <= TCPS_CLOSING)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); break; case TCP_KEEPINTVL: tp->t_keepintvl = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); break; case TCP_KEEPINIT: tp->t_keepinit = ui; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); break; } goto unlock_and_done; case TCP_KEEPCNT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); tp->t_keepcnt = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); goto unlock_and_done; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval >= 0) tcp_pcap_set_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else error = EINVAL; goto unlock_and_done; #endif case TCP_FASTOPEN: { struct tcp_fastopen tfo_optval; INP_WUNLOCK(inp); if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) return (EPERM); error = sooptcopyin(sopt, &tfo_optval, sizeof(tfo_optval), sizeof(int)); if (error) return (error); INP_WLOCK_RECHECK(inp); if ((tp->t_state != TCPS_CLOSED) && (tp->t_state != TCPS_LISTEN)) { error = EINVAL; goto unlock_and_done; } if (tfo_optval.enable) { if (tp->t_state == TCPS_LISTEN) { if (!V_tcp_fastopen_server_enable) { error = EPERM; goto unlock_and_done; } if (tp->t_tfo_pending == NULL) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); } else { /* * If a pre-shared key was provided, * stash it in the client cookie * field of the tcpcb for use during * connect. */ if (sopt->sopt_valsize == sizeof(tfo_optval)) { memcpy(tp->t_tfo_cookie.client, tfo_optval.psk, TCP_FASTOPEN_PSK_LEN); tp->t_tfo_client_cookie_len = TCP_FASTOPEN_PSK_LEN; } } tp->t_flags |= TF_FASTOPEN; } else tp->t_flags &= ~TF_FASTOPEN; goto unlock_and_done; } #ifdef TCP_BLACKBOX case TCP_LOG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); error = tcp_log_state_change(tp, optval); goto unlock_and_done; case TCP_LOGBUF: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_LOGID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); error = tcp_log_set_id(tp, buf); /* tcp_log_set_id() unlocks the INP. */ break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); if (sopt->sopt_name == TCP_LOGDUMP) { error = tcp_log_dump_tp_logbuf(tp, buf, M_WAITOK, true); INP_WUNLOCK(inp); } else { tcp_log_dump_tp_bucket_logbufs(tp, buf); /* * tcp_log_dump_tp_bucket_logbufs() drops the * INP lock. */ } break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: INP_WUNLOCK(inp); if (!TCPMD5_ENABLED()) return (ENOPROTOOPT); error = TCPMD5_PCBCTL(inp, sopt); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_REMOTE_UDP_ENCAPS_PORT: optval = ntohs(tp->t_port); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_STATS: { #ifdef STATS int nheld; TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; error = 0; socklen_t outsbsz = sopt->sopt_valsize; if (tp->t_stats == NULL) error = ENOENT; else if (outsbsz >= tp->t_stats->cursz) outsbsz = tp->t_stats->cursz; else if (outsbsz >= sizeof(struct statsblob)) outsbsz = sizeof(struct statsblob); else error = EINVAL; INP_WUNLOCK(inp); if (error) break; sbp = sopt->sopt_val; nheld = atop(round_page(((vm_offset_t)sbp) + (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp)); vm_page_t ma[nheld]; if (vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, nheld) < 0) { error = EFAULT; break; } if ((error = copyin_nofault(&(sbp->flags), &sbflags, SIZEOF_MEMBER(struct statsblob, flags)))) goto unhold; INP_WLOCK_RECHECK(inp); error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats, sbflags | SB_CLONE_USRDSTNOFAULT); INP_WUNLOCK(inp); sopt->sopt_valsize = outsbsz; unhold: vm_page_unhold_pages(ma, nheld); #else INP_WUNLOCK(inp); error = EOPNOTSUPP; #endif /* !STATS */ break; } case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { case TCP_MAXUNACKTIME: ui = TP_MAXUNACKTIME(tp) / hz; break; case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; case TCP_KEEPINTVL: ui = TP_KEEPINTVL(tp) / hz; break; case TCP_KEEPINIT: ui = TP_KEEPINIT(tp) / hz; break; case TCP_KEEPCNT: ui = TP_KEEPCNT(tp); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #ifdef TCP_BLACKBOX case TCP_LOG: optval = tp->t_logstate; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_LOGBUF: /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */ error = tcp_log_getlogbuf(sopt, tp); break; case TCP_LOGID: len = tcp_log_get_id(tp, buf); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = EINVAL; break; #endif #ifdef KERN_TLS case TCP_TXTLS_MODE: error = ktls_get_tx_mode(so, &optval); INP_WUNLOCK(inp); if (error == 0) error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_RXTLS_MODE: error = ktls_get_rx_mode(so, &optval); INP_WUNLOCK(inp); if (error == 0) error = sooptcopyout(sopt, &optval, sizeof(optval)); break; #endif case TCP_LRD: optval = tp->t_flags & TF_LRD; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #undef INP_WLOCK_RECHECK #undef INP_WLOCK_RECHECK_CLEANUP /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. */ if (tp->t_state < TCPS_ESTABLISHED && !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) /* Ignore stack's drop request, we already at it. */ (void)tcp_output_nodrop(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tptoinpcb(tp)); switch (tp->t_state) { case TCPS_LISTEN: #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_acktime == 0) tp->t_acktime = ticks; if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tptosocket(tp)); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_PREVVALID) { db_printf("%sTF_PREVVALID", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SONOTCONN) { db_printf("%sTF_SONOTCONN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_CONGRECOVERY) { db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASCRECOVERY) { db_printf("%sTF_WASCRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; } } static void db_print_tflags2(u_int t_flags2) { int comma; comma = 0; if (t_flags2 & TF2_PLPMTU_BLACKHOLE) { db_printf("%sTF2_PLPMTU_BLACKHOLE", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_PLPMTU_PMTUD) { db_printf("%sTF2_PLPMTU_PMTUD", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_PLPMTU_MAXSEGSNT) { db_printf("%sTF2_PLPMTU_MAXSEGSNT", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_LOG_AUTO) { db_printf("%sTF2_LOG_AUTO", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_DROP_AF_DATA) { db_printf("%sTF2_DROP_AF_DATA", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_ECN_PERMIT) { db_printf("%sTF2_ECN_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_ECN_SND_CWR) { db_printf("%sTF2_ECN_SND_CWR", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_ECN_SND_ECE) { db_printf("%sTF2_ECN_SND_ECE", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_ACE_PERMIT) { db_printf("%sTF2_ACE_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags2 & TF2_FBYTES_COMPLETE) { db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("t_callout: %p t_timers: %p\n", &tp->t_callout, &tp->t_timers); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags2: 0x%x (", tp->t_flags2); db_print_tflags2(tp->t_flags2); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %u snd_cwnd: %u\n", tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); db_printf("snd_ssthresh: %u snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_rcvtime: %u t_startime: %u\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin); db_print_indent(indent); db_printf("t_rttupdated: %u max_sndwnd: %u t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %u\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n", tp->snd_fack, tp->rcv_numsacks); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 51d29399e99b..aed0f26f89ba 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -1,1157 +1,1150 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { struct socket *so = inp->inp_socket; u_int16_t lport = 0; int error, lookupflags = 0; #ifdef INVARIANTS struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #endif INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); error = prison_local_ip6(cred, laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)); if (error) return(error); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags); if (error != 0) return (error); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } int in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); } else { sin6 = (struct sockaddr_in6 *)nam; KASSERT(sin6->sin6_family == AF_INET6, ("%s: invalid address family for %p", __func__, sin6)); KASSERT(sin6->sin6_len == sizeof(*sin6), ("%s: invalid address length for %p", __func__, sin6)); if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if ((error = prison_local_ip6(cred, &sin6->sin6_addr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct epoch_tracker et; struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ NET_EPOCH_ENTER(et); if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. */ if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } NET_EPOCH_EXIT(et); } if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_flags2 & INP_REUSEPORT) || (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } #endif } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); if (t && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); if (t && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_vflag & INP_IPV6PROTO) != 0)) { return (EADDRINUSE); } } #endif } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; return (error); } } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). */ static int in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct in6_addr *plocal_addr6) { int error = 0; int scope_ambiguous = 0; struct in6_addr in6a; struct epoch_tracker et; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if (!CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0) return (error); NET_EPOCH_ENTER(et); error = in6_selectsrc_socket(sin6, inp->in6p_outputopts, inp, inp->inp_cred, scope_ambiguous, &in6a, NULL); NET_EPOCH_EXIT(et); if (error) return (error); /* * Do not update this earlier, in case we return with an error. * * XXX: this in6_selectsrc_socket result might replace the bound local * address with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6a; /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int -in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, - struct ucred *cred, struct mbuf *m, bool rehash) +in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, + bool rehash) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct sockaddr_in6 laddr6; int error; KASSERT(sin6->sin6_family == AF_INET6, ("%s: invalid address family for %p", __func__, sin6)); KASSERT(sin6->sin6_len == sizeof(*sin6), ("%s: invalid address length for %p", __func__, sin6)); bzero(&laddr6, sizeof(laddr6)); laddr6.sin6_family = AF_INET6; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); #ifdef ROUTE_MPATH if (CALC_FLOWID_OUTBOUND) { uint32_t hash_type, hash_val; hash_val = fib6_calc_software_hash(&inp->in6p_laddr, &sin6->sin6_addr, 0, sin6->sin6_port, inp->inp_socket->so_proto->pr_protocol, &hash_type); inp->inp_flowid = hash_val; inp->inp_flowtype = hash_type; } #endif /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, sin6, &laddr6.sin6_addr)) != 0) return (error); if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &laddr6.sin6_addr : &inp->in6p_laddr, inp->inp_lport, 0, NULL, M_NODOM) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { /* * rehash was required to be true in the past for * this case; retain that convention. However, * we now call in_pcb_lport_dest rather than * in6_pcbbind; the former does not insert into * the hash table, the latter does. Change rehash * to false to do the in_pcbinshash below. */ KASSERT(rehash == true, ("Rehashing required for unbound inps")); rehash = false; error = in_pcb_lport_dest(inp, (struct sockaddr *) &laddr6, &inp->inp_lport, (struct sockaddr *) sin6, sin6->sin6_port, cred, INPLOOKUP_WILDCARD); if (error) return (error); } inp->in6p_laddr = laddr6.sin6_addr; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; if (inp->inp_flags & IN6P_AUTOFLOWLABEL) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); if (rehash) { in_pcbrehash(inp); } else { in_pcbinshash(inp); } return (0); } -int -in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) -{ - - return (in6_pcbconnect_mbuf(inp, nam, cred, NULL, true)); -} - void in6_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr)); memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ static bool inp_match6(const struct inpcb *inp, void *v __unused) { return ((inp->inp_vflag & INP_IPV6) != 0); } void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr_in6 *sa6_dst, u_int fport_arg, const struct sockaddr_in6 *src, u_int lport_arg, int errno, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, inp_match6, NULL); struct inpcb *inp; struct sockaddr_in6 sa6_src; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *src; flowinfo = sa6_src.sin6_flowinfo; while ((inp = inp_next(&inpi)) != NULL) { INP_WLOCK_ASSERT(inp); /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. * XXX: should we avoid to notify the value to TCP sockets? */ if (errno == EMSGSIZE && cmdarg != NULL) ip6_notify_pmtu(inp, sa6_dst, *(uint32_t *)cmdarg); /* * Detect if we should notify the error. If no source and * destination ports are specified, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { continue; } do_notify: if (notify) (*notify)(inp, errno); } } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; int matchwild = 3, wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* Found. */ if (prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (!prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } static bool in6_multi_match(const struct inpcb *inp, void *v __unused) { if ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL) return (true); else return (false); } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB, in6_multi_match, NULL); struct inpcb *inp; struct in6_multi *inm; struct in6_mfilter *imf; struct ip6_moptions *im6o; IN6_MULTI_LOCK_ASSERT(); while ((inp = inp_next(&inpi)) != NULL) { INP_RLOCK_ASSERT(inp); im6o = inp->in6p_moptions; /* * Unselect the outgoing ifp for multicast if it * is being detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ restart: IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { if ((inm = imf->im6f_in6m) == NULL) continue; if (inm->in6m_ifp != ifp) continue; ip6_mfilter_remove(&im6o->im6o_head, imf); in6_leavegroup_locked(inm, NULL); ip6_mfilter_free(imf); goto restart; } } } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route6); } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno __unused) { RO_INVALIDATE_CACHE(&inp->inp_route6); return inp; } static bool in6_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) { return (domain == M_NODOM || domain == grp->il_numa_domain); } static struct inpcb * in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, uint16_t fport, int lookupflags, uint8_t domain) { const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Search for an LB group match based on the following criteria: * - prefer jailed groups to non-jailed groups * - prefer exact source address matches to wildcard matches * - prefer groups bound to the specified NUMA domain */ jail_exact = jail_wild = local_exact = local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { bool injail; #ifdef INET if (!(grp->il_vflag & INP_IPV6)) continue; #endif if (grp->il_lport != lport) continue; injail = prison_flag(grp->il_cred, PR_IP6) != 0; if (injail && prison_check_ip6_locked(grp->il_cred->cr_prison, laddr) != 0) continue; if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) { if (injail) { jail_exact = grp; if (in6_pcblookup_lb_numa_match(grp, domain)) /* This is a perfect match. */ goto out; } else if (local_exact == NULL || in6_pcblookup_lb_numa_match(grp, domain)) { local_exact = grp; } } else if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) && (lookupflags & INPLOOKUP_WILDCARD) != 0) { if (injail) { if (jail_wild == NULL || in6_pcblookup_lb_numa_match(grp, domain)) jail_wild = grp; } else if (local_wild == NULL || in6_pcblookup_lb_numa_match(grp, domain)) { local_wild = grp; } } } if (jail_exact != NULL) grp = jail_exact; else if (jail_wild != NULL) grp = jail_wild; else if (local_exact != NULL) grp = local_exact; else grp = local_wild; if (grp == NULL) return (NULL); out: return (grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) % grp->il_inpcnt]); } /* * Lookup PCB in hash list. Used in in_pcb.c as well as here. */ struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t numa_domain) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT(!IN6_IS_ADDR_UNSPECIFIED(faddr), ("%s: invalid foreign address", __func__)); KASSERT(!IN6_IS_ADDR_UNSPECIFIED(laddr), ("%s: invalid local address", __func__)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(faddr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * First see if an LB group matches the request before scanning * all sockets on this port. */ inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr, fport, lookupflags, numa_domain); if (inp != NULL) return (inp); /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6_locked( inp->inp_cred->cr_prison, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) return (inp); else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ /* * Not found. */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); smr_enter(pcbinfo->ipi_smr); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { if (__predict_false(inp_smr_lock(inp, (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; } else smr_exit(pcbinfo->ipi_smr); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, M_NODOM)); } struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, m->m_pkthdr.numa_domain)); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = srcordst ? ip->ip6_dst : ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } diff --git a/sys/netinet6/in6_pcb.h b/sys/netinet6/in6_pcb.h index 09907e2c397c..bf84576736e4 100644 --- a/sys/netinet6/in6_pcb.h +++ b/sys/netinet6/in6_pcb.h @@ -1,115 +1,113 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.h,v 1.13 2001/02/06 09:16:53 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IN6_PCB_H_ #define _NETINET6_IN6_PCB_H_ #ifdef _KERNEL #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) void in6_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); void in6_losing(struct inpcb *); int in6_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); -int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); -int in6_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, - struct ucred *, struct mbuf *, bool); +int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *, bool); void in6_pcbdisconnect(struct inpcb *); struct inpcb * in6_pcblookup_local(struct inpcbinfo *, struct in6_addr *, u_short, int, struct ucred *); struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t); struct inpcb * in6_pcblookup(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *ifp, struct mbuf *); void in6_pcbnotify(struct inpcbinfo *, struct sockaddr_in6 *, u_int, const struct sockaddr_in6 *, u_int, int, void *, struct inpcb *(*)(struct inpcb *, int)); struct inpcb * in6_rtchange(struct inpcb *, int); struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p); struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p); int in6_getpeeraddr(struct socket *so, struct sockaddr **nam); int in6_getsockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); int in6_selecthlim(struct inpcb *, struct ifnet *); int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct ucred *); void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int); #endif /* _KERNEL */ #endif /* !_NETINET6_IN6_PCB_H_ */ diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index b79a00280110..1ab838d60ac0 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -1,1286 +1,1286 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(int, zero_checksum_port) = 0; #define V_zero_checksum_port VNET(zero_checksum_port) SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(zero_checksum_port), 0, "Zero UDP checksum allowed for traffic to/from this port."); /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ static void udp6_detach(struct socket *so); static int udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts = NULL, *tmp_opts; struct udpcb *up; bool filtered; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0], up->u_tun_ctx); INP_RLOCK(inp); if (filtered) return (in_pcbrele_rlocked(inp)); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) { m_freem(n); return (0); } } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmp_opts = sbcreatecontrol(&fromsa[1], sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR, IPPROTO_IPV6, M_NOWAIT); if (tmp_opts) { if (opts) { tmp_opts->m_next = opts; opts = tmp_opts; } else opts = tmp_opts; } } m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n, opts) == 0) { soroverflow_locked(so); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } struct udp6_multi_match_ctx { struct ip6_hdr *ip6; struct udphdr *uh; }; static bool udp6_multi_match(const struct inpcb *inp, void *v) { struct udp6_multi_match_ctx *ctx = v; if ((inp->inp_vflag & INP_IPV6) == 0) return(false); if (inp->inp_lport != ctx->uh->uh_dport) return(false); if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport) return(false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst)) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) || inp->inp_fport != ctx->uh->uh_sport)) return (false); return (true); } static int udp6_multi_input(struct mbuf *m, int off, int proto, struct sockaddr_in6 *fromsa) { struct udp6_multi_match_ctx ctx; struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx); struct inpcb *inp; struct ip6_moptions *imo; struct mbuf *n; int appends = 0; /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ ctx.ip6 = mtod(m, struct ip6_hdr *); ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off); while ((inp = inp_next(&inpi)) != NULL) { INP_RLOCK_ASSERT(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ if ((imo = inp->in6p_moptions) != NULL) { struct sockaddr_in6 mcaddr; int blocked; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ctx.ip6->ip6_dst; blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); continue; } } if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ctx.ip6, inp, ctx.uh); else UDP_PROBE(receive, NULL, inp, ctx.ip6, inp, ctx.uh); if (udp6_append(inp, n, off, fromsa)) { break; } else appends++; } /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { INP_RUNLOCK(inp); break; } } m_freem(m); if (appends == 0) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); } return (IPPROTO_DONE); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; struct sockaddr_in6 fromsa[2]; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; NET_EPOCH_ASSERT(); if (m->m_len < off + sizeof(struct udphdr)) { m = m_pullup(m, off + sizeof(struct udphdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = proto; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); /* * dport 0 was rejected earlier so this is OK even if * zero_checksum_port is 0 (which is its default value). */ if (ntohs(uh->uh_dport) == V_zero_checksum_port) goto skip_checksum; else goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } skip_checksum: /* * Construct sockaddr format source address. */ init_sin6(&fromsa[0], m, 0); fromsa[0].sin6_port = uh->uh_sport; init_sin6(&fromsa[1], m, 1); fromsa[1].sin6_port = uh->uh_dport; pcbinfo = udp_get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { *mp = NULL; return (udp6_multi_input(m, off, proto, fromsa)); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (V_udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, NULL, ip6, NULL, uh); else UDP_PROBE(receive, NULL, NULL, ip6, NULL, uh); UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole && (V_udp_blackhole_local || !in6_localaddr(&ip6->ip6_src))) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); *mp = NULL; return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); *mp = NULL; return (IPPROTO_DONE); } } if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip6, inp, uh); else UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, fromsa) == 0) INP_RUNLOCK(inp); *mp = NULL; return (IPPROTO_DONE); badunlocked: m_freem(m); *mp = NULL; return (IPPROTO_DONE); } static void udp6_common_ctlinput(struct ip6ctlparam *ip6cp, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; int errno, off = 0; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if ((errno = icmp6_errmap(ip6cp->ip6c_icmp6)) == 0) return; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); /* Check to see if its tunneled */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst, uh.uh_dport, &ip6->ip6_src, uh.uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp != NULL) { struct udpcb *up; udp_tun_icmp_t *func; up = intoudpcb(inp); func = up->u_icmp_func; INP_RUNLOCK(inp); if (func != NULL) func(ip6cp); } in6_pcbnotify(pcbinfo, ip6cp->ip6c_finaldst, uh.uh_dport, ip6cp->ip6c_src, uh.uh_sport, errno, ip6cp->ip6c_cmdarg, udp_notify); } static void udp6_ctlinput(struct ip6ctlparam *ctl) { return (udp6_common_ctlinput(ctl, &V_udbinfo)); } static void udplite6_ctlinput(struct ip6ctlparam *ctl) { return (udp6_common_ctlinput(ctl, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } NET_EPOCH_ENTER(et); inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_send(struct socket *so, int flags_arg, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct ip6_pktopts *optp, opt; struct sockaddr_in6 *sin6, tmp; struct epoch_tracker et; int cscov_partial, error, flags, hlen, scope_ambiguous; u_int32_t ulen, plen; uint16_t cscov; u_short fport; uint8_t nxt; if (addr6) { error = 0; if (addr6->sa_family != AF_INET6) error = EAFNOSUPPORT; else if (addr6->sa_len != sizeof(struct sockaddr_in6)) error = EINVAL; if (__predict_false(error != 0)) { m_freem(control); m_freem(m); return (error); } } sin6 = (struct sockaddr_in6 *)addr6; /* * In contrast to IPv4 we do not validate the max. packet length * here due to IPv6 Jumbograms (RFC2675). */ scope_ambiguous = 0; if (sin6) { /* Protect *addr6 from overwrites. */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) { if (control) m_freem(control); m_freem(m); return (error); } } inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * In the following cases we want a write lock on the inp for either * local operations or for possible route cache updates in the IPv6 * output path: * - on connected sockets (sin6 is NULL) for route cache updates, * - when we are not bound to an address and source port (it is * in6_pcbsetport() which will require the write lock). * * We check the inp fields before actually locking the inp, so * here exists a race, and we may WLOCK the inp and end with already * bound one by other thread. This is fine. */ if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0)) INP_WLOCK(inp); else INP_RLOCK(inp); nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; if (sin6 == NULL) hasv4addr = (inp->inp_vflag & INP_IPV4); else hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; if (hasv4addr) { /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_UNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); /* addr will just be freed in sendit(). */ return (udp_send(so, flags_arg | PRUS_IPV6, m, (struct sockaddr *)sin6, control, td)); } } else #endif if (sin6 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* * Given this is either an IPv6-only socket or no INET is * supported we will fail the send if the given destination * address is a v4mapped address. * * XXXGL: do we leak m and control? */ INP_UNLOCK(inp); return (EINVAL); } NET_EPOCH_ENTER(et); if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { goto release; } optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } /* * Given we handle the v4mapped case in the INET block above * assert here that it must not happen anymore. */ KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr), ("%s: sin6(%p)->sin6_addr is v4mapped which we " "should have handled.", __func__, sin6)); /* This only requires read-locking. */ error = in6_selectsrc_socket(sin6, optp, inp, td->td_ucred, scope_ambiguous, &in6a, NULL); if (error) goto release; laddr = &in6a; if (inp->inp_lport == 0) { struct inpcbinfo *pcbinfo; INP_WLOCK_ASSERT(inp); pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); INP_HASH_WLOCK(pcbinfo); error = in6_pcbsetport(laddr, inp, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; goto release; } } faddr = &sin6->sin6_addr; fport = sin6->sin6_port; /* allow 0 port */ } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } ulen = m->m_pkthdr.len; plen = sizeof(struct udphdr) + ulen; hlen = sizeof(struct ip6_hdr); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ cscov = cscov_partial = 0; udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } flags = 0; #if defined(ROUTE_MPATH) || defined(RSS) if (CALC_FLOWID_OUTBOUND_SENDTO) { uint32_t hash_type, hash_val; uint8_t pr; pr = inp->inp_socket->so_proto->pr_protocol; hash_val = fib6_calc_packet_hash(laddr, faddr, inp->inp_lport, fport, pr, &hash_type); m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } /* do not use inp flowid */ flags |= IP_NODEFAULTFLOWID; #endif UDPSTAT_INC(udps_opackets); if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); error = ip6_output(m, optp, INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); INP_UNLOCK(inp); NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); release: INP_UNLOCK(inp); NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } m_freem(m); return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); udp_abort(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcbinfo *pcbinfo; struct inpcb *inp; struct udpcb *up; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } error = in_pcballoc(so, pcbinfo); if (error) return (error); inp = (struct inpcb *)so->so_pcb; inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; up = intoudpcb(inp); bzero(&up->u_start_zero, u_zero_size); INP_WUNLOCK(inp); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof(struct sockaddr_in6)) return (EINVAL); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); vflagsav = inp->inp_vflag; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif if (error != 0) inp->inp_vflag = vflagsav; INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); (void)udp_disconnect(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { #ifdef INET struct epoch_tracker et; #endif struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_len != sizeof(*sin6)) return (EINVAL); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; vflagsav = inp->inp_vflag; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred, true); INP_HASH_WUNLOCK(pcbinfo); NET_EPOCH_EXIT(et); /* * If connect succeeds, mark socket as connected. If * connect fails and socket is unbound, reset inp_vflag * field. */ if (error == 0) soisconnected(so); else if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) inp->inp_vflag = vflagsav; goto out; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; vflagsav = inp->inp_vflag; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; INP_HASH_WLOCK(pcbinfo); - error = in6_pcbconnect(inp, nam, td->td_ucred); + error = in6_pcbconnect(inp, nam, td->td_ucred, true); INP_HASH_WUNLOCK(pcbinfo); /* * If connect succeeds, mark socket as connected. If * connect fails and socket is unbound, reset inp_vflag * field. */ if (error == 0) soisconnected(so); else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) inp->inp_vflag = vflagsav; out: INP_WUNLOCK(inp); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { INP_WUNLOCK(inp); (void)udp_disconnect(so); return (0); } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } #define UDP6_PROTOSW \ .pr_type = SOCK_DGRAM, \ .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, \ .pr_ctloutput = ip6_ctloutput, \ .pr_abort = udp6_abort, \ .pr_attach = udp6_attach, \ .pr_bind = udp6_bind, \ .pr_connect = udp6_connect, \ .pr_control = in6_control, \ .pr_detach = udp6_detach, \ .pr_disconnect = udp6_disconnect, \ .pr_peeraddr = in6_mapped_peeraddr, \ .pr_send = udp6_send, \ .pr_shutdown = udp_shutdown, \ .pr_sockaddr = in6_mapped_sockaddr, \ .pr_soreceive = soreceive_dgram, \ .pr_sosend = sosend_dgram, \ .pr_sosetlabel = in_pcbsosetlabel, \ .pr_close = udp6_close struct protosw udp6_protosw = { .pr_protocol = IPPROTO_UDP, UDP6_PROTOSW }; struct protosw udplite6_protosw = { .pr_protocol = IPPROTO_UDPLITE, UDP6_PROTOSW }; static void udp6_init(void *arg __unused) { IP6PROTO_REGISTER(IPPROTO_UDP, udp6_input, udp6_ctlinput); IP6PROTO_REGISTER(IPPROTO_UDPLITE, udp6_input, udplite6_ctlinput); } SYSINIT(udp6_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp6_init, NULL);