diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index 2e307975b9ca..1c0c5624b6d7 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -1,1785 +1,1761 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_kern_tls.h" #include #include #include #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. 
* * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Primary Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Primary Zone, * thus sharing backend Slab kegs with the Mbuf Primary Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Primary Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. 
* */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ bool mb_use_ext_pgs = false; /* use M_EXTPG mbufs for sendfile & TLS */ static int sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS) { int error, extpg; extpg = mb_use_ext_pgs; error = sysctl_handle_int(oidp, &extpg, 0, req); if (error == 0 && req->newptr != NULL) { if (extpg != 0 && !PMAP_HAS_DMAP) error = EOPNOTSUPP; else mb_use_ext_pgs = extpg != 0; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLTYPE_INT | CTLFLAG_RW, &mb_use_ext_pgs, 0, sysctl_mb_use_ext_pgs, "IU", "Use unmapped mbufs for sendfile(2) and TLS offload"); static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); static counter_u64_t snd_tag_count; SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, &snd_tag_count, "# of active mbuf send tags"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; int extpg; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. 
*/ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); /* * Unmapped mbufs can only safely be used on platforms with a direct * map. 
*/ if (PMAP_HAS_DMAP) { extpg = 1; TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg); mb_use_ext_pgs = extpg != 0; } } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT | 
CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); /* Ensure that MSIZE is a power of 2. 
*/
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
/* A struct mbuf has to fit into one MSIZE-byte item of the mbuf zone. */
_Static_assert(sizeof(struct mbuf) <= MSIZE, "size of mbuf exceeds MSIZE");

/*
 * Initialize FreeBSD Network buffer allocation.
 *
 * Creates the mbuf, cluster, packet and jumbo UMA zones.  For every zone
 * whose limit variable is positive the limit is applied with
 * uma_zone_set_max() and the value it returns is stored back; each
 * limited zone also gets a warning string and mb_reclaim() as its
 * max-action callback.
 */
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
	    MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0)
		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbclusters > 0)
		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
	uma_zone_set_maxaction(zone_clust, mb_reclaim);

	/* The packet zone is a secondary zone layered on zone_mbuf. */
	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make jumbo frame zone too. Page size, 9k and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbop > 0)
		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbo9 > 0)
		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbo16 > 0)
		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

-	/*
-	 * Hook event handler for low-memory situation, used to
-	 * drain protocols and push data back to the caches (UMA
-	 * later pushes it back to VM).
-	 */
-	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
-	    EVENTHANDLER_PRI_FIRST);
-
	snd_tag_count = counter_u64_alloc(M_WAITOK);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef DEBUGNET
/*
 * debugnet makes use of a pre-allocated pool of mbufs and clusters.  When
 * debugnet is configured, we initialize a set of UMA cache zones which return
 * items from this pool.  At panic-time, the regular UMA zone pointers are
 * overwritten with those of the cache zones so that drivers may allocate and
 * free mbufs and clusters without attempting to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
*/ static struct mbufq dn_mbufq = { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; static struct mbufq dn_clustq = { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; static int dn_clsize; static uma_zone_t dn_zone_mbuf; static uma_zone_t dn_zone_clust; static uma_zone_t dn_zone_pack; static struct debugnet_saved_zones { uma_zone_t dsz_mbuf; uma_zone_t dsz_clust; uma_zone_t dsz_pack; uma_zone_t dsz_jumbop; uma_zone_t dsz_jumbo9; uma_zone_t dsz_jumbo16; bool dsz_debugnet_zones_enabled; } dn_saved_zones; static int dn_buf_import(void *arg, void **store, int count, int domain __unused, int flags) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = mbufq_dequeue(q); if (m == NULL) break; trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_buf_release(void *arg, void **store, int count) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = store[i]; (void)mbufq_enqueue(q, m); } } static int dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, int flags __unused) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = m_get(MT_DATA, M_NOWAIT); if (m == NULL) break; clust = uma_zalloc(dn_zone_clust, M_NOWAIT); if (clust == NULL) { m_free(m); break; } mb_ctor_clust(clust, dn_clsize, m, 0); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_pack_release(void *arg __unused, void **store, int count) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = store[i]; clust = m->m_ext.ext_buf; uma_zfree(dn_zone_clust, clust); uma_zfree(dn_zone_mbuf, m); } } /* * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy * the corresponding UMA cache zones. 
*/ void debugnet_mbuf_drain(void) { struct mbuf *m; void *item; if (dn_zone_mbuf != NULL) { uma_zdestroy(dn_zone_mbuf); dn_zone_mbuf = NULL; } if (dn_zone_clust != NULL) { uma_zdestroy(dn_zone_clust); dn_zone_clust = NULL; } if (dn_zone_pack != NULL) { uma_zdestroy(dn_zone_pack); dn_zone_pack = NULL; } while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) m_free(m); while ((item = mbufq_dequeue(&dn_clustq)) != NULL) uma_zfree(m_getzone(dn_clsize), item); } /* * Callback invoked immediately prior to starting a debugnet connection. */ void debugnet_mbuf_start(void) { MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); /* Save the old zone pointers to restore when debugnet is closed. */ dn_saved_zones = (struct debugnet_saved_zones) { .dsz_debugnet_zones_enabled = true, .dsz_mbuf = zone_mbuf, .dsz_clust = zone_clust, .dsz_pack = zone_pack, .dsz_jumbop = zone_jumbop, .dsz_jumbo9 = zone_jumbo9, .dsz_jumbo16 = zone_jumbo16, }; /* * All cluster zones return buffers of the size requested by the * drivers. It's up to the driver to reinitialize the zones if the * MTU of a debugnet-enabled interface changes. */ printf("debugnet: overwriting mbuf zone pointers\n"); zone_mbuf = dn_zone_mbuf; zone_clust = dn_zone_clust; zone_pack = dn_zone_pack; zone_jumbop = dn_zone_clust; zone_jumbo9 = dn_zone_clust; zone_jumbo16 = dn_zone_clust; } /* * Callback invoked when a debugnet connection is closed/finished. */ void debugnet_mbuf_finish(void) { MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); printf("debugnet: restoring mbuf zone pointers\n"); zone_mbuf = dn_saved_zones.dsz_mbuf; zone_clust = dn_saved_zones.dsz_clust; zone_pack = dn_saved_zones.dsz_pack; zone_jumbop = dn_saved_zones.dsz_jumbop; zone_jumbo9 = dn_saved_zones.dsz_jumbo9; zone_jumbo16 = dn_saved_zones.dsz_jumbo16; memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); } /* * Reinitialize the debugnet mbuf+cluster pool and cache zones. 
*/ void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) { struct mbuf *m; void *item; debugnet_mbuf_drain(); dn_clsize = clsize; dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, dn_buf_import, dn_buf_release, &dn_mbufq, UMA_ZONE_NOBUCKET); dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, clsize, mb_ctor_clust, NULL, NULL, NULL, dn_buf_import, dn_buf_release, &dn_clustq, UMA_ZONE_NOBUCKET); dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, dn_pack_import, dn_pack_release, NULL, UMA_ZONE_NOBUCKET); while (nmbuf-- > 0) { m = m_get(MT_DATA, M_WAITOK); uma_zfree(dn_zone_mbuf, m); } while (nclust-- > 0) { item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); uma_zfree(dn_zone_clust, item); } } #endif /* DEBUGNET */ /* * Constructor for Mbuf primary zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf primary zone destructor. 
*/
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags __diagused;

	m = (struct mbuf *)mem;
	/* 'arg' is only inspected by the assertion below. */
	flags = (unsigned long)arg;

	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
	KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__));
	/* Release any packet tags still hanging off a packet header. */
	if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
		m_tag_delete_chain(m, NULL);
}

/*
 * The Mbuf Packet zone destructor.
 *
 * Deletes the tag chain of packet-header mbufs, asserts that the attached
 * cluster is in the state the packet zone expects, and drains the packet
 * zone when the cluster zone reports exhaustion.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
#if defined(INVARIANTS) && !defined(KMSAN)
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate. We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted(zone_clust))
		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * empty we allocate just the cluster without setting
 * the mbuf to it.  See mbuf.h.
*/ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #if defined(INVARIANTS) && !defined(KMSAN) trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #if defined(INVARIANTS) && !defined(KMSAN) trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #if defined(INVARIANTS) && !defined(KMSAN) trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #if defined(INVARIANTS) && !defined(KMSAN) trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. 
Called by UMA whenever any of the
 * mbuf zones is closed to its limit.
- *
- * No locks should be held when this is called.  The drain routines have to
- * presently acquire some locks which raises the possibility of lock order
- * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
-	struct epoch_tracker et;
-	struct domain *dp;
-	struct protosw *pr;
-
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);
-
-	NET_EPOCH_ENTER(et);
-	for (dp = domains; dp != NULL; dp = dp->dom_next)
-		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
-			if (pr->pr_drain != NULL)
-				(*pr->pr_drain)();
-	NET_EPOCH_EXIT(et);
+
+	EVENTHANDLER_INVOKE(mbuf_lowmem, VM_LOW_MBUFS);
}

/*
 * Free "count" units of I/O from an mbuf chain.  They could be held
 * in M_EXTPG or just as a normal mbuf.  This code is intended to be
 * called in an error path (I/O error, closed connection, etc).
 */
void
mb_free_notready(struct mbuf *m, int count)
{
	int i;

	for (i = 0; i < count && m != NULL; i++) {
		if ((m->m_flags & M_EXTPG) != 0) {
			/*
			 * An M_EXTPG mbuf accounts for m_epg_nrdy units;
			 * it is only freed once all of them are consumed.
			 */
			m->m_epg_nrdy--;
			if (m->m_epg_nrdy != 0)
				continue;
		}
		/* m_free() returns the next mbuf of the chain. */
		m = m_free(m);
	}
	KASSERT(i == count, ("Removed only %d items from %p", i, m));
}

/*
 * Compress an unmapped mbuf into a simple mbuf when it holds a small
 * amount of data.  This is used as a DOS defense to avoid having
 * small packets tie up wired pages, an ext_pgs structure, and an
 * mbuf.  Since this converts the existing mbuf in place, it can only
 * be used if there are no other references to 'm'.
 *
 * Returns 0 on success, or EBUSY when the reference count is not 1.
 */
int
mb_unmapped_compress(struct mbuf *m)
{
	volatile u_int *refcnt;
	char buf[MLEN];

	/*
	 * Assert that 'm' does not have a packet header.  If 'm' had
	 * a packet header, it would only be able to hold MHLEN bytes
	 * and m_data would have to be initialized differently.
	 */
	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));

	/* Locate the reference count: embedded in 'm' or external. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
	}

	if (*refcnt != 1)
		return (EBUSY);

	/* Stash the payload on the stack before the pages go away. */
	m_copydata(m, 0, m->m_len, buf);

	/* Free the backing pages. */
	m->m_ext.ext_free(m);

	/* Turn 'm' into a "normal" mbuf. */
	m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
	m->m_data = m->m_dat;

	/* Copy data back into m. */
	bcopy(buf, mtod(m, char *), m->m_len);

	return (0);
}

/*
 * These next few routines are used to permit downgrading an unmapped
 * mbuf to a chain of mapped mbufs.  This is used when an interface
 * doesn't support unmapped mbufs or if checksums need to be
 * computed in software.
 *
 * Each unmapped mbuf is converted to a chain of mbufs.  First, any
 * TLS header data is stored in a regular mbuf.  Second, each page of
 * unmapped data is stored in an mbuf with an EXT_SFBUF external
 * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
 * associated physical page.  They also hold a reference on the
 * original M_EXTPG mbuf to ensure the physical page doesn't go away.
 * Finally, any TLS trailer data is stored in a regular mbuf.
 *
 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
 * mbufs.  It frees the associated sf_buf and releases its reference
 * on the original M_EXTPG mbuf.
 *
 * _mb_unmapped_to_ext() is a helper function that converts a single
 * unmapped mbuf into a chain of mbufs.
 *
 * mb_unmapped_to_ext() is the public function that walks an mbuf
 * chain converting any unmapped mbufs to mapped mbufs.  It returns
 * the new chain of mapped mbufs on success.  On failure it frees
 * the original mbuf chain and returns NULL.
*/ static void mb_unmapped_free_mext(struct mbuf *m) { struct sf_buf *sf; struct mbuf *old_m; sf = m->m_ext.ext_arg1; sf_buf_free(sf); /* Drop the reference on the backing M_EXTPG mbuf. */ old_m = m->m_ext.ext_arg2; mb_free_extpg(old_m); } static struct mbuf * _mb_unmapped_to_ext(struct mbuf *m) { struct mbuf *m_new, *top, *prev, *mref; struct sf_buf *sf; vm_page_t pg; int i, len, off, pglen, pgoff, seglen, segoff; volatile u_int *refcnt; u_int ref_inc = 0; M_ASSERTEXTPG(m); len = m->m_len; KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* Skip over any data removed from the front. */ off = mtod(m, vm_offset_t); top = NULL; if (m->m_epg_hdrlen != 0) { if (off >= m->m_epg_hdrlen) { off -= m->m_epg_hdrlen; } else { seglen = m->m_epg_hdrlen - off; segoff = off; seglen = min(seglen, len); off = 0; len -= seglen; m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; m_new->m_len = seglen; prev = top = m_new; memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], seglen); } } pgoff = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off >= pglen) { off -= pglen; pgoff = 0; continue; } seglen = pglen - off; segoff = pgoff + off; off = 0; seglen = min(seglen, len); len -= seglen; pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) { top = prev = m_new; } else { prev->m_next = m_new; prev = m_new; } sf = sf_buf_alloc(pg, SFB_NOWAIT); if (sf == NULL) goto fail; ref_inc++; m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); m_new->m_data 
+= segoff; m_new->m_len = seglen; pgoff = 0; }; if (len != 0) { KASSERT((off + len) <= m->m_epg_trllen, ("off + len > trail (%d + %d > %d)", off, len, m->m_epg_trllen)); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) top = m_new; else prev->m_next = m_new; m_new->m_len = len; memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); } if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be dropped * in mb_unmapped_free_mext(). */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); return (top); fail: if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be * immediately dropped when these mbufs are freed * below. */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); m_freem(top); return (NULL); } struct mbuf * mb_unmapped_to_ext(struct mbuf *top) { struct mbuf *m, *next, *prev = NULL; prev = NULL; for (m = top; m != NULL; m = next) { /* m might be freed, so cache the next pointer. */ next = m->m_next; if (m->m_flags & M_EXTPG) { if (prev != NULL) { /* * Remove 'm' from the new chain so * that the 'top' chain terminates * before 'm' in case 'top' is freed * due to an error. */ prev->m_next = NULL; } m = _mb_unmapped_to_ext(m); if (m == NULL) { m_freem(top); m_freem(next); return (NULL); } if (prev == NULL) { top = m; } else { prev->m_next = m; } /* * Replaced one mbuf with a chain, so we must * find the end of chain. */ prev = m_last(m); } else { if (prev != NULL) { prev->m_next = m; } prev = m; } } return (top); } /* * Allocate an empty M_EXTPG mbuf. The ext_free routine is * responsible for freeing any pages backing this mbuf when it is * freed. 
*/
struct mbuf *
mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
{
	struct mbuf *m;

	m = m_get(how, MT_DATA);
	if (m == NULL)
		return (NULL);

	/* Start with no pages, no TLS state and no data pointer. */
	m->m_epg_npgs = 0;
	m->m_epg_nrdy = 0;
	m->m_epg_1st_off = 0;
	m->m_epg_last_len = 0;
	m->m_epg_flags = 0;
	m->m_epg_hdrlen = 0;
	m->m_epg_trllen = 0;
	m->m_epg_tls = NULL;
	m->m_epg_so = NULL;
	m->m_data = NULL;
	m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG);
	/* The reference count is embedded in this mbuf and starts at 1. */
	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
	m->m_ext.ext_count = 1;
	m->m_ext.ext_size = 0;
	m->m_ext.ext_free = ext_free;

	return (m);
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 *
 * The storage is released according to its ext_type once this mbuf turns
 * out to hold the last reference.
 */
void
mb_free_ext(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/*
	 * Check if the header is embedded in the cluster.  It is
	 * important that we can't touch any of the mbuf fields
	 * after we have freed the external storage, since mbuf
	 * could have been embedded in it.  For now, the mbufs
	 * embedded into the cluster are always of type EXT_EXTREF,
	 * and for this type we won't free the mref.
	 */
	if (m->m_flags & M_NOFREE) {
		freembuf = 0;
		KASSERT(m->m_ext.ext_type == EXT_EXTREF || m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m));
	} else
		freembuf = 1;

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		switch (m->m_ext.ext_type) {
		case EXT_PACKET:
			/* The packet zone is special. */
			if (*refcnt == 0)
				*refcnt = 1;
			uma_zfree(zone_pack, mref);
			break;
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_SFBUF:
		case EXT_NET_DRV:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			/* These types rely on the owner's ext_free callback. */
			KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__));
			mref->m_ext.ext_free(mref);
			m_free_raw(mref);
			break;
		case EXT_EXTREF:
			KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__));
			m->m_ext.ext_free(m);
			break;
		case EXT_RXRING:
			KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free is set", __func__));
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__));
		}
	}

	/* 'mref' was dealt with above; free 'm' only if it is a different mbuf. */
	if (freembuf && m != mref)
		m_free_raw(m);
}

/*
 * Clean up after mbufs with M_EXTPG storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_extpg(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;

	M_ASSERTEXTPG(m);

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__));

		mref->m_ext.ext_free(mref);
#ifdef KERN_TLS
		/*
		 * With a TLS session attached, the mbuf is queued via
		 * ktls_enqueue_to_free() when
		 * refcount_release_if_not_last() returns false.
		 */
		if (mref->m_epg_tls != NULL && !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
			ktls_enqueue_to_free(mref);
		else
#endif
			m_free_raw(mref);
	}

	if (m != mref)
		m_free_raw(m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()	- attach cluster to already allocated mbuf.
 * m_cljget()	- attach jumbo cluster to already allocated mbuf.
 * m_get2()	- allocate minimum mbuf that would fit size argument.
 * m_getm2()	- allocate a chain of mbufs/clusters.
 * m_extadd()	- attach external cluster to mbuf.
 *
 * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()	- free chain of mbufs.
 */

/*
 * Attach a cluster from zone_clust to 'm'.  Returns non-zero (the M_EXT
 * flag) when a cluster ended up attached and 0 otherwise.
 */
int
m_clget(struct mbuf *m, int how)
{

	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m));
	m->m_ext.ext_buf = (char *)NULL;
	uma_zalloc_arg(zone_clust, m, how);
	/*
	 * On a cluster allocation failure, drain the packet zone and retry,
	 * we might be able to loosen a few clusters up on the drain.
	 */
	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
		uma_zalloc_arg(zone_clust, m, how);
	}
	MBUF_PROBE2(m__clget, m, how);
	return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
*/ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_get3() allocates minimum mbuf that would fit "size" argument. * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. */ struct mbuf * m_get3(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size <= MJUMPAGESIZE) return (m_get2(size, how, type, flags)); if (size > MJUM16BYTES) return (NULL); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); if (size <= MJUM9BYTES) zone = zone_jumbo9; else zone = zone_jumbo16; n = uma_zalloc_arg(zone, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 
*/ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } MBUF_PROBE5(m__getjcl, how, type, flags, size, m); return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { mb = NULL; if (len > MCLBYTES) { mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), MJUMPAGESIZE); } if (mb == NULL) { if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* * Fail the whole operation if one mbuf can't be * allocated. */ if (mb == NULL) { m_freem(nm); return (NULL); } } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. 
*/ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } /* * Temporary primitive to allow freeing without going through m_free. 
*/ void m_free_raw(struct mbuf *mb) { uma_zfree(zone_mbuf, mb); } int m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **mstp) { if (ifp->if_snd_tag_alloc == NULL) return (EOPNOTSUPP); return (ifp->if_snd_tag_alloc(ifp, params, mstp)); } void m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, const struct if_snd_tag_sw *sw) { if_ref(ifp); mst->ifp = ifp; refcount_init(&mst->refcount, 1); mst->sw = sw; counter_u64_add(snd_tag_count, 1); } void m_snd_tag_destroy(struct m_snd_tag *mst) { struct ifnet *ifp; ifp = mst->ifp; mst->sw->snd_tag_free(mst); if_rele(ifp); counter_u64_add(snd_tag_count, -1); } void m_rcvif_serialize(struct mbuf *m) { u_short idx, gen; M_ASSERTPKTHDR(m); idx = m->m_pkthdr.rcvif->if_index; gen = m->m_pkthdr.rcvif->if_idxgen; m->m_pkthdr.rcvidx = idx; m->m_pkthdr.rcvgen = gen; if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) { idx = m->m_pkthdr.leaf_rcvif->if_index; gen = m->m_pkthdr.leaf_rcvif->if_idxgen; } else { idx = -1; gen = 0; } m->m_pkthdr.leaf_rcvidx = idx; m->m_pkthdr.leaf_rcvgen = gen; } struct ifnet * m_rcvif_restore(struct mbuf *m) { struct ifnet *ifp, *leaf_ifp; M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) { leaf_ifp = NULL; } else { leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx, m->m_pkthdr.leaf_rcvgen); if (__predict_false(leaf_ifp != NULL && (leaf_ifp->if_flags & IFF_DYING))) leaf_ifp = NULL; } m->m_pkthdr.leaf_rcvif = leaf_ifp; m->m_pkthdr.rcvif = ifp; return (ifp); } /* * Allocate an mbuf with anonymous external pages. 
*/ struct mbuf * mb_alloc_ext_plus_pages(int len, int how) { struct mbuf *m; vm_page_t pg; int i, npgs; m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (m == NULL) return (NULL); m->m_epg_flags |= EPG_FLAG_ANON; npgs = howmany(len, PAGE_SIZE); for (i = 0; i < npgs; i++) { do { pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { if (how == M_NOWAIT) { m->m_epg_npgs = i; m_free(m); return (NULL); } vm_wait(NULL); } } while (pg == NULL); m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); } m->m_epg_npgs = npgs; return (m); } /* * Copy the data in the mbuf chain to a chain of mbufs with anonymous external * unmapped pages. * len is the length of data in the input mbuf chain. * mlen is the maximum number of bytes put into each ext_page mbuf. */ struct mbuf * mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, struct mbuf **mlast) { struct mbuf *m, *mout; char *pgpos, *mbpos; int i, mblen, mbufsiz, pglen, xfer; if (len == 0) return (NULL); mbufsiz = min(mlen, len); m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); if (m == NULL) return (m); pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); pglen = PAGE_SIZE; mblen = 0; i = 0; do { if (pglen == 0) { if (++i == m->m_epg_npgs) { m->m_epg_last_len = PAGE_SIZE; mbufsiz = min(mlen, len); m->m_next = mb_alloc_ext_plus_pages(mbufsiz, how); m = m->m_next; if (m == NULL) { m_freem(mout); return (m); } i = 0; } pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); pglen = PAGE_SIZE; } while (mblen == 0) { if (mp == NULL) { m_freem(mout); return (NULL); } KASSERT((mp->m_flags & M_EXTPG) == 0, ("mb_copym_ext_pgs: ext_pgs input mbuf")); mbpos = mtod(mp, char *); mblen = mp->m_len; mp = mp->m_next; } xfer = min(mblen, pglen); memcpy(pgpos, mbpos, xfer); pgpos += xfer; mbpos += xfer; pglen -= xfer; mblen -= xfer; len -= xfer; m->m_len += xfer; } while (len > 0); m->m_epg_last_len = PAGE_SIZE - pglen; if (mlast != NULL) *mlast = m; return (mout); } diff --git a/sys/kern/uipc_debug.c b/sys/kern/uipc_debug.c 
index 5f96850431a0..3f54e3e46f26 100644 --- a/sys/kern/uipc_debug.c +++ b/sys/kern/uipc_debug.c @@ -1,517 +1,514 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Debugger routines relating to sockets, protocols, etc, for use in DDB. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #ifdef DDB #include static void db_print_sotype(short so_type) { switch (so_type) { case SOCK_STREAM: db_printf("SOCK_STREAM"); break; case SOCK_DGRAM: db_printf("SOCK_DGRAM"); break; case SOCK_RAW: db_printf("SOCK_RAW"); break; case SOCK_RDM: db_printf("SOCK_RDM"); break; case SOCK_SEQPACKET: db_printf("SOCK_SEQPACKET"); break; default: db_printf("unknown"); break; } } static void db_print_sooptions(int so_options) { int comma; comma = 0; if (so_options & SO_DEBUG) { db_printf("%sSO_DEBUG", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTCONN) { db_printf("%sSO_ACCEPTCONN", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEADDR) { db_printf("%sSO_REUSEADDR", comma ? ", " : ""); comma = 1; } if (so_options & SO_KEEPALIVE) { db_printf("%sSO_KEEPALIVE", comma ? ", " : ""); comma = 1; } if (so_options & SO_DONTROUTE) { db_printf("%sSO_DONTROUTE", comma ? ", " : ""); comma = 1; } if (so_options & SO_BROADCAST) { db_printf("%sSO_BROADCAST", comma ? ", " : ""); comma = 1; } if (so_options & SO_USELOOPBACK) { db_printf("%sSO_USELOOPBACK", comma ? ", " : ""); comma = 1; } if (so_options & SO_LINGER) { db_printf("%sSO_LINGER", comma ? ", " : ""); comma = 1; } if (so_options & SO_OOBINLINE) { db_printf("%sSO_OOBINLINE", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEPORT) { db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEPORT_LB) { db_printf("%sSO_REUSEPORT_LB", comma ? ", " : ""); comma = 1; } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; } if (so_options & SO_NOSIGPIPE) { db_printf("%sSO_NOSIGPIPE", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTFILTER) { db_printf("%sSO_ACCEPTFILTER", comma ? ", " : ""); comma = 1; } if (so_options & SO_BINTIME) { db_printf("%sSO_BINTIME", comma ? 
", " : ""); comma = 1; } if (so_options & SO_NO_OFFLOAD) { db_printf("%sSO_NO_OFFLOAD", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_DDP) { db_printf("%sSO_NO_DDP", comma ? ", " : ""); comma = 1; } } static void db_print_sostate(short so_state) { int comma; comma = 0; if (so_state & SS_ISCONNECTED) { db_printf("%sSS_ISCONNECTED", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTING) { db_printf("%sSS_ISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISDISCONNECTING) { db_printf("%sSS_ISDISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_NBIO) { db_printf("%sSS_NBIO", comma ? ", " : ""); comma = 1; } if (so_state & SS_ASYNC) { db_printf("%sSS_ASYNC", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONFIRMING) { db_printf("%sSS_ISCONFIRMING", comma ? ", " : ""); comma = 1; } } static void db_print_soqstate(int so_qstate) { int comma; comma = 0; if (so_qstate & SQ_INCOMP) { db_printf("%sSQ_INCOMP", comma ? ", " : ""); comma = 1; } if (so_qstate & SQ_COMP) { db_printf("%sSQ_COMP", comma ? ", " : ""); comma = 1; } } static void db_print_sbstate(short sb_state) { int comma; comma = 0; if (sb_state & SBS_CANTSENDMORE) { db_printf("%sSBS_CANTSENDMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_CANTRCVMORE) { db_printf("%sSBS_CANTRCVMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_RCVATMARK) { db_printf("%sSBS_RCVATMARK", comma ? 
", " : ""); comma = 1; } } static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_domain(struct domain *d, const char *domain_name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", domain_name, d); indent += 2; db_print_indent(indent); db_printf("dom_family: %d ", d->dom_family); db_printf("dom_name: %s\n", d->dom_name); db_print_indent(indent); db_printf("dom_externalize: %p ", d->dom_externalize); db_printf("dom_dispose: %p\n", d->dom_dispose); db_print_indent(indent); db_printf("dom_protosw: %p ", d->dom_protosw); db_printf("dom_next: %p\n", d->dom_next); db_print_indent(indent); db_printf("dom_rtattach: %p ", d->dom_rtattach); db_print_indent(indent); db_printf("dom_ifattach: %p ", d->dom_ifattach); db_printf("dom_ifdetach: %p\n", d->dom_ifdetach); } static void db_print_prflags(short pr_flags) { int comma; comma = 0; if (pr_flags & PR_ATOMIC) { db_printf("%sPR_ATOMIC", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_ADDR) { db_printf("%sPR_ADDR", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_CONNREQUIRED) { db_printf("%sPR_CONNREQUIRED", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_WANTRCVD) { db_printf("%sPR_WANTRCVD", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_RIGHTS) { db_printf("%sPR_RIGHTS", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_IMPLOPCL) { db_printf("%sPR_IMPLOPCL", comma ? 
", " : ""); comma = 1; } } static void db_print_protosw(struct protosw *pr, const char *prname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", prname, pr); indent += 2; db_print_indent(indent); db_printf("pr_type: %d ", pr->pr_type); db_printf("pr_domain: %p\n", pr->pr_domain); if (pr->pr_domain != NULL) db_print_domain(pr->pr_domain, "pr_domain", indent); db_print_indent(indent); db_printf("pr_protocol: %d\n", pr->pr_protocol); db_print_indent(indent); db_printf("pr_flags: %d (", pr->pr_flags); db_print_prflags(pr->pr_flags); db_printf(")\n"); db_print_indent(indent); db_printf("pr_ctloutput: %p ", pr->pr_ctloutput); - - db_print_indent(indent); - db_printf("pr_drain: %p\n", pr->pr_drain); } static void db_print_sbflags(short sb_flags) { int comma; comma = 0; if (sb_flags & SB_WAIT) { db_printf("%sSB_WAIT", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_SEL) { db_printf("%sSB_SEL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_ASYNC) { db_printf("%sSB_ASYNC", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_UPCALL) { db_printf("%sSB_UPCALL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_NOINTR) { db_printf("%sSB_NOINTR", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AIO) { db_printf("%sSB_AIO", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_KNOTE) { db_printf("%sSB_KNOTE", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AUTOSIZE) { db_printf("%sSB_AUTOSIZE", comma ? 
", " : ""); comma = 1; } } static void db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", sockbufname, sb); indent += 2; db_print_indent(indent); db_printf("sb_state: 0x%x (", sb->sb_state); db_print_sbstate(sb->sb_state); db_printf(")\n"); db_print_indent(indent); db_printf("sb_mb: %p ", sb->sb_mb); db_printf("sb_mbtail: %p ", sb->sb_mbtail); db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord); db_print_indent(indent); db_printf("sb_sndptr: %p ", sb->sb_sndptr); db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff); db_print_indent(indent); db_printf("sb_acc: %u ", sb->sb_acc); db_printf("sb_ccc: %u ", sb->sb_ccc); db_printf("sb_hiwat: %u ", sb->sb_hiwat); db_printf("sb_mbcnt: %u ", sb->sb_mbcnt); db_printf("sb_mbmax: %u\n", sb->sb_mbmax); db_print_indent(indent); db_printf("sb_ctl: %u ", sb->sb_ctl); db_printf("sb_lowat: %d ", sb->sb_lowat); db_printf("sb_timeo: %jd\n", sb->sb_timeo); db_print_indent(indent); db_printf("sb_flags: 0x%x (", sb->sb_flags); db_print_sbflags(sb->sb_flags); db_printf(")\n"); db_print_indent(indent); db_printf("sb_aiojobq first: %p\n", TAILQ_FIRST(&sb->sb_aiojobq)); } static void db_print_socket(struct socket *so, const char *socketname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", socketname, so); indent += 2; db_print_indent(indent); db_printf("so_count: %d ", so->so_count); db_printf("so_type: %d (", so->so_type); db_print_sotype(so->so_type); db_printf(")\n"); db_print_indent(indent); db_printf("so_options: 0x%x (", so->so_options); db_print_sooptions(so->so_options); db_printf(")\n"); db_print_indent(indent); db_printf("so_linger: %d ", so->so_linger); db_printf("so_state: 0x%x (", so->so_state); db_print_sostate(so->so_state); db_printf(")\n"); db_print_indent(indent); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); if (so->so_proto != NULL) db_print_protosw(so->so_proto, "so_proto", indent); 
db_print_indent(indent); if (so->so_options & SO_ACCEPTCONN) { db_printf("sol_incomp first: %p ", TAILQ_FIRST(&so->sol_incomp)); db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); db_printf("sol_qlen: %d ", so->sol_qlen); db_printf("sol_incqlen: %d ", so->sol_incqlen); db_printf("sol_qlimit: %d ", so->sol_qlimit); } else { db_printf("so_qstate: 0x%x (", so->so_qstate); db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_listen: %p ", so->so_listen); /* so_list skipped */ db_printf("so_timeo: %d ", so->so_timeo); db_printf("so_error: %d\n", so->so_error); db_print_indent(indent); db_printf("so_sigio: %p ", so->so_sigio); db_printf("so_oobmark: %lu\n", so->so_oobmark); db_print_sockbuf(&so->so_rcv, "so_rcv", indent); db_print_sockbuf(&so->so_snd, "so_snd", indent); } } DB_SHOW_COMMAND(socket, db_show_socket) { struct socket *so; if (!have_addr) { db_printf("usage: show socket \n"); return; } so = (struct socket *)addr; db_print_socket(so, "socket", 0); } DB_SHOW_COMMAND(sockbuf, db_show_sockbuf) { struct sockbuf *sb; if (!have_addr) { db_printf("usage: show sockbuf \n"); return; } sb = (struct sockbuf *)addr; db_print_sockbuf(sb, "sockbuf", 0); } DB_SHOW_COMMAND(protosw, db_show_protosw) { struct protosw *pr; if (!have_addr) { db_printf("usage: show protosw \n"); return; } pr = (struct protosw *)addr; db_print_protosw(pr, "protosw", 0); } DB_SHOW_COMMAND(domain, db_show_domain) { struct domain *d; if (!have_addr) { db_printf("usage: show protosw \n"); return; } d = (struct domain *)addr; db_print_domain(d, "domain", 0); } #endif diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 20e7c87a6c20..c6a79d34beb2 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -1,445 +1,444 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization * * Note: domain initialization takes place on a per domain basis * as a result of traversing a SYSINIT linker set. Most likely, * each domain would want to call DOMAIN_SET(9) itself, which * would cause the domain to be added just after domaininit() * is called during startup. 
* * See DOMAIN_SET(9) for details on its use. */ static void domaininit(void *); SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL); static void domainfinalize(void *); SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL); struct domain *domains; /* registered protocol domains */ int domain_init_status = 0; static struct mtx dom_mtx; /* domain list lock */ MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF); /* * Dummy protocol specific user requests function pointer array. * All functions return EOPNOTSUPP. */ struct pr_usrreqs nousrreqs = { .pru_accept = pru_accept_notsupp, .pru_attach = pru_attach_notsupp, .pru_bind = pru_bind_notsupp, .pru_connect = pru_connect_notsupp, .pru_connect2 = pru_connect2_notsupp, .pru_control = pru_control_notsupp, .pru_disconnect = pru_disconnect_notsupp, .pru_listen = pru_listen_notsupp, .pru_peeraddr = pru_peeraddr_notsupp, .pru_rcvd = pru_rcvd_notsupp, .pru_rcvoob = pru_rcvoob_notsupp, .pru_send = pru_send_notsupp, .pru_sense = pru_sense_null, .pru_shutdown = pru_shutdown_notsupp, .pru_sockaddr = pru_sockaddr_notsupp, .pru_sosend = pru_sosend_notsupp, .pru_soreceive = pru_soreceive_notsupp, .pru_sopoll = pru_sopoll_notsupp, }; static void pr_usrreqs_init(struct protosw *pr) { struct pr_usrreqs *pu; pu = pr->pr_usrreqs; KASSERT(pu != NULL, ("%s: %ssw[%d] has no usrreqs!", __func__, pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); /* * Protocol switch methods fall into three categories: mandatory, * mandatory but protosw_init() provides a default, and optional. * * For true protocols (i.e., pru_attach != NULL), KASSERT truly * mandatory methods with no defaults, and initialize defaults for * other mandatory methods if the protocol hasn't defined an * implementation (NULL function pointer). 
*/ #if 0 if (pu->pru_attach != NULL) { KASSERT(pu->pru_abort != NULL, ("protosw_init: %ssw[%d] pru_abort NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); KASSERT(pu->pru_send != NULL, ("protosw_init: %ssw[%d] pru_send NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); } #endif #define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar) DEFAULT(pu->pru_accept, pru_accept_notsupp); DEFAULT(pu->pru_aio_queue, pru_aio_queue_notsupp); DEFAULT(pu->pru_bind, pru_bind_notsupp); DEFAULT(pu->pru_bindat, pru_bindat_notsupp); DEFAULT(pu->pru_connect, pru_connect_notsupp); DEFAULT(pu->pru_connect2, pru_connect2_notsupp); DEFAULT(pu->pru_connectat, pru_connectat_notsupp); DEFAULT(pu->pru_control, pru_control_notsupp); DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp); DEFAULT(pu->pru_listen, pru_listen_notsupp); DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp); DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp); DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp); DEFAULT(pu->pru_sense, pru_sense_null); DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp); DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp); DEFAULT(pu->pru_sosend, sosend_generic); DEFAULT(pu->pru_soreceive, soreceive_generic); DEFAULT(pu->pru_sopoll, sopoll_generic); DEFAULT(pu->pru_ready, pru_ready_notsupp); #undef DEFAULT } /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. 
*/ void domain_init(void *arg) { struct domain *dp = arg; struct protosw *pr; int flags; MPASS(IS_DEFAULT_VNET(curvnet)); flags = atomic_load_acq_int(&dp->dom_flags); if ((flags & DOMF_SUPPORTED) == 0) return; MPASS((flags & DOMF_INITED) == 0); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { pr_usrreqs_init(pr); } /* * update global information about maximums */ max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); atomic_set_rel_int(&dp->dom_flags, DOMF_INITED); } /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. */ void domain_add(void *data) { struct domain *dp; dp = (struct domain *)data; if (dp->dom_probe != NULL && (*dp->dom_probe)() != 0) return; atomic_set_rel_int(&dp->dom_flags, DOMF_SUPPORTED); mtx_lock(&dom_mtx); dp->dom_next = domains; domains = dp; KASSERT(domain_init_status >= 1, ("attempt to domain_add(%s) before domaininit()", dp->dom_name)); #ifndef INVARIANTS if (domain_init_status < 1) printf("WARNING: attempt to domain_add(%s) before " "domaininit()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); } void domain_remove(void *data) { struct domain *dp = (struct domain *)data; if ((dp->dom_flags & DOMF_UNLOADABLE) == 0) return; mtx_lock(&dom_mtx); if (domains == dp) { domains = dp->dom_next; } else { struct domain *curr; for (curr = domains; curr != NULL; curr = curr->dom_next) { if (curr->dom_next == dp) { curr->dom_next = dp->dom_next; break; } } } mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domaininit(void *dummy) { if (max_linkhdr < 16) /* XXX */ max_linkhdr = 16; mtx_lock(&dom_mtx); KASSERT(domain_init_status == 0, ("domaininit called too late!")); domain_init_status = 1; mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domainfinalize(void *dummy) { mtx_lock(&dom_mtx); KASSERT(domain_init_status == 1, ("domainfinalize called too 
late!")); domain_init_status = 2; mtx_unlock(&dom_mtx); } struct domain * pffinddomain(int family) { struct domain *dp; for (dp = domains; dp != NULL; dp = dp->dom_next) if (dp->dom_family == family) return (dp); return (NULL); } struct protosw * pffindtype(int family, int type) { struct domain *dp; struct protosw *pr; dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_type && pr->pr_type == type) return (pr); return (NULL); } struct protosw * pffindproto(int family, int protocol, int type) { struct domain *dp; struct protosw *pr; struct protosw *maybe; maybe = NULL; if (family == 0) return (NULL); dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) return (pr); if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && pr->pr_protocol == 0 && maybe == NULL) maybe = pr; } return (maybe); } /* * The caller must make sure that the new protocol is fully set up and ready to * accept requests before it is registered. */ int pf_proto_register(int family, struct protosw *npr) { struct domain *dp; struct protosw *pr, *fpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (npr->pr_type == 0) return (EPROTOTYPE); if (npr->pr_protocol == 0) return (EPROTONOSUPPORT); if (npr->pr_usrreqs == NULL) return (ENXIO); /* Try to find the specified domain based on the family. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); /* Initialize backpointer to struct domain. */ npr->pr_domain = dp; fpr = NULL; /* * Protect us against races when two protocol registrations for * the same protocol happen at the same time. */ mtx_lock(&dom_mtx); /* The new protocol must not yet exist. 
*/ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == npr->pr_type) && (pr->pr_protocol == npr->pr_protocol)) { mtx_unlock(&dom_mtx); return (EEXIST); /* XXX: Check only protocol? */ } /* While here, remember the first free spacer. */ if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER)) fpr = pr; } /* If no free spacer is found we can't add the new protocol. */ if (fpr == NULL) { mtx_unlock(&dom_mtx); return (ENOMEM); } /* Copy the new struct protosw over the spacer. */ bcopy(npr, fpr, sizeof(*fpr)); pr_usrreqs_init(fpr); /* Job is done, no more protection required. */ mtx_unlock(&dom_mtx); return (0); } /* * The caller must make sure the protocol and its functions correctly shut down * all sockets and release all locks and memory references. */ int pf_proto_unregister(int family, int protocol, int type) { struct domain *dp; struct protosw *pr, *dpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (protocol == 0) return (EPROTONOSUPPORT); if (type == 0) return (EPROTOTYPE); /* Try to find the specified domain based on the family type. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); dpr = NULL; /* Lock out everyone else while we are manipulating the protosw. */ mtx_lock(&dom_mtx); /* The protocol must exist and only once. */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) { if (dpr != NULL) { mtx_unlock(&dom_mtx); return (EMLINK); /* Should not happen! */ } else dpr = pr; } } /* Protocol does not exist. */ if (dpr == NULL) { mtx_unlock(&dom_mtx); return (EPROTONOSUPPORT); } /* De-orbit the protocol and make the slot available again. */ dpr->pr_type = 0; dpr->pr_domain = dp; dpr->pr_protocol = PROTO_SPACER; dpr->pr_flags = 0; dpr->pr_ctloutput = NULL; - dpr->pr_drain = NULL; dpr->pr_usrreqs = &nousrreqs; /* Job is done, not more protection required. 
*/ mtx_unlock(&dom_mtx); return (0); } diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 9b5f41976197..cac885560a30 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -1,314 +1,303 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mrouting.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include /* * While this file provides the domain and protocol switch tables for IPv4, it * also provides the sysctl node declarations for net.inet.* often shared with * IPv6 for common features or by upper layer protocols. In case of no IPv4 * support compile out everything but these sysctl nodes. */ #ifdef INET #include #include #include #include #endif /* INET */ #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #include #include #include #include #include #include #include #include #include #include /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. */ static struct pr_usrreqs nousrreqs; #ifdef SCTP #include #include #include #include #endif FEATURE(inet, "Internet Protocol version 4"); extern struct domain inetdomain; /* Spacer for loadable protocols. 
*/ #define IPPROTOSPACER \ { \ .pr_domain = &inetdomain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inetsw[] = { -{ - .pr_type = 0, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_IP, - .pr_flags = PR_CAPATTACH, - .pr_drain = ip_drain, - .pr_usrreqs = &nousrreqs -}, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| PR_CAPATTACH, .pr_ctloutput = tcp_ctloutput, - .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. 
*/ .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RSVP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_MOBILE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, /* Spacer n-times for loadable 
protocols. */ IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_usrreqs = &rip_usrreqs }, }; struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[nitems(inetsw)], .dom_rtattach = in_inithead, #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; DOMAIN_SET(inet); #endif /* INET */ SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Internet Family"); SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP"); SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ICMP"); SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP"); #if defined(SCTP) || defined(SCTP_SUPPORT) SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SCTP"); #endif SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IGMP"); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* XXX no protocol # to use, pick something "reserved" */ SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPSEC"); SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "AH"); SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ESP"); SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "RAW"); SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Accept filters"); diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 
ca451ef48649..e17d6fccb202 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1,1396 +1,1381 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #endif #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); -extern void ipreass_drain(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); /* * Respond with an ICMP host redirect when we forward a packet out of * the same interface on which it was received. See RFC 792. 
*/ VNET_DEFINE(int, ipsendredirects) = 1; SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); VNET_DEFINE_STATIC(bool, ip_strong_es) = false; #define V_ip_strong_es VNET(ip_strong_es) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, rfc1122_strong_es, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_strong_es), false, "Packet's IP destination address must match address on arrival interface"); VNET_DEFINE_STATIC(bool, ip_sav) = true; #define V_ip_sav VNET(ip_sav) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, source_address_validation, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_sav), true, "Drop incoming packets with source address that is a local address"); VNET_DEFINE(pfil_head_t, inet_pfil_head); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif ipproto_input_t *ip_protox[IPPROTO_MAX] = { [0 ... IPPROTO_MAX - 1] = rip_input }; ipproto_ctlinput_t *ip_ctlprotox[IPPROTO_MAX] = { [0 ... IPPROTO_MAX - 1] = rip_ctlinput }; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ /* Make sure it is safe to use hashinit(9) on CK_LIST. 
*/ CTASSERT(sizeof(struct in_ifaddrhashhead) == sizeof(LIST_HEAD(, in_addr))); #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets 
dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ static void ip_vnet_init(void *arg __unused) { struct pfil_head_args args; CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. 
*/ args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_IP4; args.pa_headname = PFIL_INET_NAME; V_inet_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); #ifdef VIMAGE netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif #endif } VNET_SYSINIT(ip_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, ip_vnet_init, NULL); static void ip_init(const void *unused __unused) { /* * Register statically compiled protocols, that are unlikely to * ever become dynamic. */ IPPROTO_REGISTER(IPPROTO_ICMP, icmp_input, NULL); IPPROTO_REGISTER(IPPROTO_IGMP, igmp_input, NULL); IPPROTO_REGISTER(IPPROTO_RSVP, rsvp_input, NULL); IPPROTO_REGISTER(IPPROTO_IPV4, encap4_input, NULL); IPPROTO_REGISTER(IPPROTO_MOBILE, encap4_input, NULL); IPPROTO_REGISTER(IPPROTO_ETHERIP, encap4_input, NULL); IPPROTO_REGISTER(IPPROTO_GRE, encap4_input, NULL); IPPROTO_REGISTER(IPPROTO_IPV6, encap4_input, NULL); IPPROTO_REGISTER(IPPROTO_PIM, encap4_input, NULL); #ifdef SCTP /* XXX: has a loadable & static version */ IPPROTO_REGISTER(IPPROTO_SCTP, sctp_input, sctp_ctlinput); #endif netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } SYSINIT(ip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_init, NULL); #ifdef VIMAGE static void ip_destroy(void *unused __unused) { int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); pfil_head_unregister(V_inet_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input 
helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ rib_flush_routes_family(AF_INET); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ IPSTAT_INC(ips_delivered); ip_protox[ip->ip_p](&m, &hlen, ip->ip_p); } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ bool strong_es; M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. 
*/ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (__predict_false(m->m_pkthdr.len < sizeof(struct ip))) goto tooshort; if (m->m_len < sizeof(struct ip)) { m = m_pullup(m, sizeof(struct ip)); if (__predict_false(m == NULL)) { IPSTAT_INC(ips_toosmall); return; } } ip = mtod(m, struct ip *); if (__predict_false(ip->ip_v != IPVERSION)) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (__predict_false(hlen < sizeof(struct ip))) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { m = m_pullup(m, hlen); if (__predict_false(m == NULL)) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* IN_LOOPBACK must not appear on the wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (__predict_false(sum)) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (__predict_false(ip_len < hlen)) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. 
*/ if (__predict_false(m->m_pkthdr.len < ip_len)) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Try to forward the packet, but if we fail continue. * ip_tryforward() may generate redirects these days. * XXX the logic below falling through to normal processing * if redirects are required should be revisited as well. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ if (V_ipforwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { /* * ip_dooptions() was run so we can ignore the source route (or * any IP options case) case for redirects in ip_tryforward(). */ if ((m = ip_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ip = mtod(m, struct ip *); goto ours; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. 
*/ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) != PFIL_PASS) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (ip->ip_p == IPPROTO_RSVP && V_rsvp_on) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) with a list of additional predicates: * - if IP forwarding is disabled * - the packet is not locally generated * - the packet is not subject to 'ipfw fwd' * - Interface is not running CARP. If the packet got here, we already * checked it with carp_iamatch() and carp_forus(). 
*/ strong_es = V_ip_strong_es && (V_ipforwarding == 0) && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ CK_LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr != ip->ip_dst.s_addr) continue; /* * net.inet.ip.rfc1122_strong_es: the address matches, verify * that the packet arrived via the correct interface. */ if (__predict_false(strong_es && ia->ia_ifp != ifp)) { IPSTAT_INC(ips_badaddr); goto bad; } /* * net.inet.ip.source_address_validation: drop incoming * packets that pretend to be ours. */ if (V_ip_sav && !(ifp->if_flags & IFF_LOOPBACK) && __predict_false(in_localip_fib(ip->ip_src, ifp->if_fib))) { IPSTAT_INC(ips_badaddr); goto bad; } counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); goto ours; } /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp->if_flags & IFF_BROADCAST) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); goto ours; } #endif } ia = NULL; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * RFC 3927 2.7: Do not forward multicast packets from * IN_LINKLOCAL. 
*/ if (V_ip_mrouter && !IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) { goto ours; } IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* RFC 3927 2.7: Do not forward packets to or from IN_LINKLOCAL. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? 
*/ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); ip_protox[ip->ip_p](&m, &hlen, ip->ip_p); return; bad: m_freem(m); } -void -ip_drain(void) -{ - VNET_ITERATOR_DECL(vnet_iter); - - VNET_LIST_RLOCK_NOSLEEP(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - ipreass_drain(); - CURVNET_RESTORE(); - } - VNET_LIST_RUNLOCK_NOSLEEP(); -} - int ipproto_register(uint8_t proto, ipproto_input_t input, ipproto_ctlinput_t ctl) { MPASS(proto > 0); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to rip_input() is unused. */ if (ip_protox[proto] == rip_input) { ip_protox[proto] = input; ip_ctlprotox[proto] = ctl; return (0); } else return (EEXIST); } int ipproto_unregister(uint8_t proto) { MPASS(proto > 0); if (ip_protox[proto] != rip_input) { ip_protox[proto] = rip_input; ip_ctlprotox[proto] = rip_ctlinput; return (0); } else return (ENOENT); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. 
*/ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; uint32_t flowid; int error, type = 0, code = 0, mtu = 0; NET_EPOCH_ASSERT(); if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if ( #ifdef IPSTEALTH V_ipstealth == 0 && #endif ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; flowid = m->m_pkthdr.flowid; ro.ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_REF, flowid); if (ro.ro_nh != NULL) { ia = ifatoia(ro.ro_nh->nh_ifa); } else ia = NULL; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copym() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. 
*/ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (V_ipstealth == 0) #endif ip->ip_ttl -= IPTTLDEC; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { /* mbuf consumed by IPsec */ RO_NHFREE(&ro); m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); return; } /* No IPsec processing required */ } #endif /* IPSEC */ /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct nhop_object *nh; nh = ro.ro_nh; if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) { struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa); u_long src = ntohl(ip->ip_src.s_addr); if (nh_ia != NULL && (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) { /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; if (nh->nh_flags & NHF_GATEWAY) { if (nh->gw_sa.sa_family == AF_INET) dest.s_addr = nh->gw4_sa.sin_addr.s_addr; else /* Do not redirect in case gw is AF_INET6 */ type = 0; } else dest.s_addr = ip->ip_dst.s_addr; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_nh) mtu = ro.ro_nh->nh_mtu; RO_NHFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); return; } } if (mcopy == NULL) return; switch (error) { case 0: /* forwarded, 
but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest.s_addr, mtu); } #define CHECK_SO_CT(sp, ct) \ (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { bool stamped; stamped = false; if ((inp->inp_socket->so_options & SO_BINTIME) || CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { struct bintime boottimebin, bt; struct timespec ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt); getboottimebin(&boottimebin); bintime_add(&bt, &boottimebin); } else { bintime(&bt); } *mp = sbcreatecontrol(&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { struct bintime boottimebin, bt1; struct timespec ts1; struct timeval tv; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &tv); } else { microtime(&tv); } *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp 
= &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { struct bintime boottimebin; struct timespec ts, ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&ts, &ts1, &ts); } else { nanotime(&ts); } *mp = sbcreatecontrol(&ts, sizeof(ts), SCM_REALTIME, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { struct timespec ts; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &ts); else nanouptime(&ts); *mp = sbcreatecontrol(&ts, sizeof(ts), SCM_MONOTONIC, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol(&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol(&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol(&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. 
*/ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol(opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol(ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif)) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol(sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol(&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? 
*/ *mp = sbcreatecontrol(&flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol(&flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol(&rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ VNET_DEFINE_STATIC(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. 
*/ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c index b436d6282206..a0a8dd42b758 100644 --- a/sys/netinet/ip_reass.c +++ b/sys/netinet/ip_reass.c @@ -1,880 +1,890 @@ /*- * Copyright (c) 2015 Gleb Smirnoff * Copyright (c) 2015 Adrian Chadd * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC #include #endif SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ #define IPREASS_NHASH_LOG2 10 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; int count; }; VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]); #define V_ipq VNET(ipq) VNET_DEFINE_STATIC(uint32_t, ipq_hashseed); #define V_ipq_hashseed VNET(ipq_hashseed) #define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) #define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock) #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) #define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); #define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) void ipreass_init(void); -void ipreass_drain(void); #ifdef VIMAGE void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static void ipreass_zone_change(void *); static void ipreass_drain_tomax(void); static void ipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static 
inline void ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); ipq_free(bucket, fp); } static inline void ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ipq_free(bucket, fp); } /* * By default, limit the number of IP fragments across all reassembly * queues to 1/32 of the total number of mbuf clusters. * * Limit the total number of reassembly queues per VNET to the * IP fragment limit, but ensure the limit will not allow any bucket * to grow above 100 items. (The bucket limit is * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct * multiplier to reach a 100-item limit.) * The 100-item limit was chosen as brief testing seems to show that * this produces "reasonable" performance on some subset of systems * under DoS attack. */ #define IP_MAXFRAGS (nmbclusters / 32) #define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50)) static int maxfrags; static u_int __exclusive_cache_line nfrags; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, &maxfrags, 0, "Maximum number of IPv4 fragments allowed across all reassembly queues"); SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, &nfrags, 0, "Current number of IPv4 fragments across all reassembly queues"); VNET_DEFINE_STATIC(uma_zone_t, ipq_zone); #define V_ipq_zone VNET(ipq_zone) SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_maxfragpackets, "I", "Maximum number of IPv4 fragment reassembly queue entries"); SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, &VNET_NAME(ipq_zone), "Current number of IPv4 fragment reassembly queue entries"); VNET_DEFINE_STATIC(int, noreass); #define V_noreass VNET(noreass) VNET_DEFINE_STATIC(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, 
&VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxfragbucketsize, "I", "Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ #define M_IP_FRAG M_PROTO9 struct mbuf * ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp; struct ifnet *srcifp; struct ipqhead *head; int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. * Also, drop packet if it would exceed the maximum * number of fragments. */ tmpmax = maxfrags; if (V_noreass == 1 || V_maxfragsperpacket == 0 || (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len = htons(ntohs(ip->ip_len) - hlen); /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes, unless * this is the last fragment. 
*/ if (ip->ip_len == htons(0) || ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) { IPSTAT_INC(ips_toosmall); /* XXX */ IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } if (ip->ip_off & htons(IP_MF)) m->m_flags |= M_IP_FRAG; else m->m_flags &= ~M_IP_FRAG; ip->ip_off = htons(ntohs(ip->ip_off) << 3); /* * Make sure the fragment lies within a packet of valid size. */ if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } /* * Store receive network interface pointer for later. */ srcifp = m->m_pkthdr.rcvif; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ IPSTAT_INC(ips_fragments); m->m_pkthdr.PH_loc.ptr = ip; /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; hashkey[0] = ip->ip_src.s_addr; hashkey[1] = ip->ip_dst.s_addr; hashkey[2] = (uint32_t)ip->ip_p << 16; hashkey[2] += ip->ip_id; hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); hash &= IPREASS_HMASK; head = &V_ipq[hash].head; IPQ_LOCK(hash); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) break; /* * If first fragment to arrive, create a reassembly queue. 
*/ if (fp == NULL) { if (V_ipq[hash].count < V_ipreass_maxbucketsize) fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); V_ipq[hash].count++; fp->ipq_nfrags = 1; atomic_add_int(&nfrags, 1); fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; if (m->m_flags & M_IP_FRAG) fp->ipq_maxoff = -1; else fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); m->m_nextpkt = NULL; goto done; } else { /* * If we already saw the last fragment, make sure * this fragment's offset looks sane. Otherwise, if * this is the last fragment, record its endpoint. */ if (fp->ipq_maxoff > 0) { i = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) || ((m->m_flags & M_IP_FRAG) == 0 && i != fp->ipq_maxoff)) { fp = NULL; goto dropfrag; } } else if ((m->m_flags & M_IP_FRAG) == 0) fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); fp->ipq_nfrags++; atomic_add_int(&nfrags, 1); #ifdef MAC mac_ipq_update(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. 
*/ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the preceding * segment, then it's checksum is invalidated. */ if (p) { i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) - ntohs(ip->ip_off); if (i > 0) { if (i >= ntohs(ip->ip_len)) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) > ntohs(GETIP(q)->ip_off); q = nq) { i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) - ntohs(GETIP(q)->ip_off); if (i < ntohs(GETIP(q)->ip_len)) { GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i); GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i); m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; atomic_subtract_int(&nfrags, 1); m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) 
* */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) ipq_drop(&V_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) ipq_drop(&V_ipq[hash], fp); goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); ipq_drop(&V_ipq[hash], fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_demote_pkthdr(q); m_cat(m, q); } /* * In order to do checksumming faster we do 'end-around carry' here * (and not in for{} loop), though it implies we are not going to * reassemble more than 64k fragments. */ while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); atomic_subtract_int(&nfrags, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible. 
*/ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); V_ipq[hash].count--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ m_fixhdr(m); /* set valid receive interface pointer */ m->m_pkthdr.rcvif = srcifp; } IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(hash); #ifdef RSS /* * Query the RSS layer for the flowid / flowtype for the * mbuf payload. * * For now, just assume we have to calculate a new one. * Later on we should check to see if the assigned flowid matches * what RSS wants for the given IP protocol and if so, just keep it. * * We then queue into the relevant netisr so it can be dispatched * to the correct CPU. * * Note - this may return 1, which means the flowid in the mbuf * is correct for the configured RSS hash types and can be used. */ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { m->m_pkthdr.flowid = rss_hash; M_HASHTYPE_SET(m, rss_type); } /* * Queue/dispatch for reprocessing. * * Note: this is much slower than just handling the frame in the * current receive context. It's likely worth investigating * why this is. */ netisr_dispatch(NETISR_IP_DIRECT, m); return (NULL); #endif /* Handle in-line */ return (m); dropfrag: IPSTAT_INC(ips_fragdropped); if (fp != NULL) { fp->ipq_nfrags--; atomic_subtract_int(&nfrags, 1); } m_freem(m); done: IPQ_UNLOCK(hash); return (NULL); #undef GETIP } /* * If a timer expires on a reassembly queue, discard it. 
*/ static struct callout ipreass_callout; static void ipreass_slowtimo(void *arg __unused) { VNET_ITERATOR_DECL(vnet_iter); struct ipq *fp, *tmp; if (atomic_load_int(&nfrags) == 0) return; VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (int i = 0; i < IPREASS_NHASH; i++) { if (TAILQ_EMPTY(&V_ipq[i].head)) continue; IPQ_LOCK(i); TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp) if (--fp->ipq_ttl == 0) ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10, ipreass_slowtimo, NULL, 0); } static void ipreass_timer_init(void *arg __unused) { callout_init(&ipreass_callout, 1); callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10, ipreass_slowtimo, NULL, 0); } SYSINIT(ipreass, SI_SUB_VNET_DONE, SI_ORDER_ANY, ipreass_timer_init, NULL); +/* + * Drain off all datagram fragments. + */ +static void +ipreass_drain(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + while(!TAILQ_EMPTY(&V_ipq[i].head)) + ipq_drop(&V_ipq[i], + TAILQ_FIRST(&V_ipq[i].head)); + KASSERT(V_ipq[i].count == 0, + ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, + V_ipq[i].count, V_ipq)); + IPQ_UNLOCK(i); + } + CURVNET_RESTORE(); + } +} + + /* * Initialize IP reassembly structures. 
*/ void ipreass_init(void) { int max; for (int i = 0; i < IPREASS_NHASH; i++) { TAILQ_INIT(&V_ipq[i].head); mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF | MTX_DUPOK); V_ipq[i].count = 0; } V_ipq_hashseed = arc4random(); V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); max = IP_MAXFRAGPACKETS; max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); if (IS_DEFAULT_VNET(curvnet)) { maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); - } -} - -/* - * Drain off all datagram fragments. - */ -void -ipreass_drain(void) -{ - - for (int i = 0; i < IPREASS_NHASH; i++) { - IPQ_LOCK(i); - while(!TAILQ_EMPTY(&V_ipq[i].head)) - ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head)); - KASSERT(V_ipq[i].count == 0, - ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, - V_ipq[i].count, V_ipq)); - IPQ_UNLOCK(i); + EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL, + LOWMEM_PRI_DEFAULT); + EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL, + LOWMEM_PRI_DEFAULT); } } /* * Drain off all datagram fragments belonging to * the given network interface. */ static void ipreass_cleanup(void *arg __unused, struct ifnet *ifp) { struct ipq *fp, *temp; struct mbuf *m; int i; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); CURVNET_SET_QUIET(ifp->if_vnet); /* * Skip processing if IPv4 reassembly is not initialised or * torn down by ipreass_destroy(). */ if (V_ipq_zone == NULL) { CURVNET_RESTORE(); return; } for (i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); /* Scan fragment list. 
*/ TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) { for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) { /* clear no longer valid rcvif pointer */ if (m->m_pkthdr.rcvif == ifp) m->m_pkthdr.rcvif = NULL; } } IPQ_UNLOCK(i); } CURVNET_RESTORE(); } EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0); #ifdef VIMAGE /* * Destroy IP reassembly structures. */ void ipreass_destroy(void) { ipreass_drain(); uma_zdestroy(V_ipq_zone); V_ipq_zone = NULL; for (int i = 0; i < IPREASS_NHASH; i++) mtx_destroy(&V_ipq[i].lock); } #endif /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void ipreass_drain_tomax(void) { struct ipq *fp; int target; /* * Make sure each bucket is under the new limit. If * necessary, drop enough of the oldest elements from * each bucket to get under the new limit. */ for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); while (V_ipq[i].count > V_ipreass_maxbucketsize && (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } /* * If we are over the maximum number of fragments, * drain off enough to get down to the new limit, * stripping off last elements on queues. Every * run we strip the oldest element from each bucket. 
*/ target = uma_zone_get_max(V_ipq_zone); while (uma_zone_get_cur(V_ipq_zone) > target) { for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); if (fp != NULL) ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } } static void ipreass_zone_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); int max; maxfrags = IP_MAXFRAGS; max = IP_MAXFRAGPACKETS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Change the limit on the UMA zone, or disable the fragment allocation * at all. Since 0 and -1 is a special values here, we need our own handler, * instead of sysctl_handle_uma_zone_max(). */ static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) { int error, max; if (V_noreass == 0) { max = uma_zone_get_max(V_ipq_zone); if (max == 0) max = -1; } else max = 0; error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); if (max > 0) { /* * XXXRW: Might be a good idea to sanity check the argument * and place an extreme upper bound. */ max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); V_noreass = 0; } else if (max == 0) { V_noreass = 1; ipreass_drain(); } else if (max == -1) { V_noreass = 0; uma_zone_set_max(V_ipq_zone, 0); V_ipreass_maxbucketsize = INT_MAX; } else return (EINVAL); return (0); } /* * Seek for old fragment queue header that can be reused. Try to * reuse a header from currently locked hash bucket. 
*/ static struct ipq * ipq_reuse(int start) { struct ipq *fp; int bucket, i; IPQ_LOCK_ASSERT(start); for (i = 0; i < IPREASS_NHASH; i++) { bucket = (start + i) % IPREASS_NHASH; if (bucket != start && IPQ_TRYLOCK(bucket) == 0) continue; fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); if (fp) { struct mbuf *m; IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { m = fp->ipq_frags; fp->ipq_frags = m->m_nextpkt; m_freem(m); } TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); V_ipq[bucket].count--; if (bucket != start) IPQ_UNLOCK(bucket); break; } if (bucket != start) IPQ_UNLOCK(bucket); } IPQ_LOCK_ASSERT(start); return (fp); } /* * Free a fragment reassembly header and all associated datagrams. */ static void ipq_free(struct ipqbucket *bucket, struct ipq *fp) { struct mbuf *q; atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(&bucket->head, fp, ipq_list); bucket->count--; uma_zfree(V_ipq_zone, fp); } /* * Get or set the maximum number of reassembly queues per bucket. */ static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) { int error, max; max = V_ipreass_maxbucketsize; error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); if (max <= 0) return (EINVAL); V_ipreass_maxbucketsize = max; ipreass_drain_tomax(); return (0); } diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 8711e0291379..7701b64c1be0 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -1,302 +1,301 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IP_VAR_H_ #define _NETINET_IP_VAR_H_ #include #include #include #include /* * Overlay for ip header used by other protocols (tcp, udp). */ struct ipovly { u_char ih_x1[9]; /* (unused) */ u_char ih_pr; /* protocol */ u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ }; #ifdef _KERNEL /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. * They are timed out after ipq_ttl drops to 0, and may also * be reclaimed if memory becomes tight. 
*/ struct ipq { TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ u_char ipq_ttl; /* time for reass q to live */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ int ipq_maxoff; /* total length of packet */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; u_char ipq_nfrags; /* # frags in this packet */ struct label *ipq_label; /* MAC label */ }; #endif /* _KERNEL */ /* * Structure stored in mbuf in inpcb.ip_options * and passed to ip_output when ip options are in use. * The actual length of the options (including ipopt_dst) * is in m_len. */ #define MAX_IPOPTLEN 40 struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; #if defined(_NETINET_IN_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. * This structure is lazy-allocated. */ struct ip_moptions { struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ u_long imo_multicast_vif; /* vif num outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ struct ip_mfilter_head imo_head; /* group membership list */ }; #else struct ip_moptions; #endif struct ipstat { uint64_t ips_total; /* total packets received */ uint64_t ips_badsum; /* checksum bad */ uint64_t ips_tooshort; /* packet too short */ uint64_t ips_toosmall; /* not enough data */ uint64_t ips_badhlen; /* ip header length < data size */ uint64_t ips_badlen; /* ip length < ip header length */ uint64_t ips_fragments; /* fragments received */ uint64_t ips_fragdropped; /* frags dropped (dups, out of space) */ uint64_t ips_fragtimeout; /* fragments timed out */ uint64_t ips_forward; /* packets forwarded */ uint64_t ips_fastforward; /* packets 
fast forwarded */ uint64_t ips_cantforward; /* packets rcvd for unreachable dest */ uint64_t ips_redirectsent; /* packets forwarded on same net */ uint64_t ips_noproto; /* unknown or unsupported protocol */ uint64_t ips_delivered; /* datagrams delivered to upper level*/ uint64_t ips_localout; /* total ip packets generated here */ uint64_t ips_odropped; /* lost packets due to nobufs, etc. */ uint64_t ips_reassembled; /* total packets reassembled ok */ uint64_t ips_fragmented; /* datagrams successfully fragmented */ uint64_t ips_ofragments; /* output fragments created */ uint64_t ips_cantfrag; /* don't fragment flag was set, etc. */ uint64_t ips_badoptions; /* error in option processing */ uint64_t ips_noroute; /* packets discarded due to no route */ uint64_t ips_badvers; /* ip version != 4 */ uint64_t ips_rawout; /* total raw ip packets generated */ uint64_t ips_toolong; /* ip length > max ip packet size */ uint64_t ips_notmember; /* multicasts for unregistered grps */ uint64_t ips_nogif; /* no match gif found */ uint64_t ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL #include #include VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define IPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)) #define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) /* * Kernel module consumers must use this accessor macro. 
*/ void kmod_ipstat_inc(int statnum); #define KMOD_IPSTAT_INC(name) \ kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t)) void kmod_ipstat_dec(int statnum); #define KMOD_IPSTAT_DEC(name) \ kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t)) /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ #define IP_SENDONES 0x4 /* send all-ones broadcast */ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ #define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #define IP_NO_SND_TAG_RL 0x80 /* Don't send down the ratelimit tag */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 #else #define IP_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif struct ip; struct inpcb; struct route; struct sockopt; struct inpcbinfo; VNET_DECLARE(int, ip_defttl); /* default IP ttl */ VNET_DECLARE(int, ipforwarding); /* ip forwarding */ VNET_DECLARE(int, ipsendredirects); #ifdef IPSTEALTH VNET_DECLARE(int, ipstealth); /* stealth forwarding */ #endif VNET_DECLARE(struct socket *, ip_rsvpd); /* reservation protocol daemon*/ VNET_DECLARE(struct socket *, ip_mrouter); /* multicast routing daemon */ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); VNET_DECLARE(int, rsvp_on); VNET_DECLARE(int, drop_redirect); extern struct pr_usrreqs rip_usrreqs; #define V_ip_id VNET(ip_id) #define V_ip_defttl VNET(ip_defttl) #define V_ipforwarding VNET(ipforwarding) #define V_ipsendredirects VNET(ipsendredirects) #ifdef IPSTEALTH #define V_ipstealth VNET(ipstealth) #endif #define V_ip_rsvpd VNET(ip_rsvpd) #define V_ip_mrouter VNET(ip_mrouter) #define V_rsvp_on VNET(rsvp_on) #define V_drop_redirect VNET(drop_redirect) void inp_freemoptions(struct ip_moptions *); int inp_getmoptions(struct inpcb 
*, struct sockopt *); int inp_setmoptions(struct inpcb *, struct sockopt *); int ip_ctloutput(struct socket *, struct sockopt *sopt); -void ip_drain(void); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags); void ip_forward(struct mbuf *m, int srcrt); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int ip_output(struct mbuf *, struct mbuf *, struct route *, int, struct ip_moptions *, struct inpcb *); struct mbuf * ip_reass(struct mbuf *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_fillid(struct ip *); int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); int rip_input(struct mbuf **, int *, int); int ipip_input(struct mbuf **, int *, int); int rsvp_input(struct mbuf **, int *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); extern void (*ip_rsvp_force_done)(struct socket *); extern int (*rsvp_input_p)(struct mbuf **, int *, int); VNET_DECLARE(struct pfil_head *, inet_pfil_head); #define V_inet_pfil_head VNET(inet_pfil_head) #define PFIL_INET_NAME "inet" void in_delayed_cksum(struct mbuf *m); /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */ /* * Reference to an ipfw or packet filter rule that can be carried * outside critical sections. * A rule is identified by rulenum:rule_id which is ordered. * In version chain_id the rule can be found in slot 'slot', so * we don't need a lookup if chain_id == chain->id. * * On exit from the firewall this structure refers to the rule after * the matching one (slot points to the new rule; rulenum:rule_id-1 * is the matching rule), and additional info (e.g. info often contains * the insn argument or tablearg in the low 16 bits, in host format). * On entry, the structure is valid if slot>0, and refers to the starting * rules. 
'info' contains the reason for reinject, e.g. divert port, * divert direction, and so on. */ struct ipfw_rule_ref { uint32_t slot; /* slot for matching rule */ uint32_t rulenum; /* matching rule number */ uint32_t rule_id; /* matching rule id */ uint32_t chain_id; /* ruleset id */ uint32_t info; /* see below */ }; enum { IPFW_INFO_MASK = 0x0000ffff, IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ #define MTAG_IPFW_CALL 1308397630 /* call stack */ struct ip_fw_args; typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args); typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) /* Divert hooks. */ extern void (*ip_divert_ptr)(struct mbuf *m, bool incoming); /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ extern int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool); extern int (*ip_dn_ctl_ptr)(struct sockopt *); extern int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *); #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ diff --git a/sys/netinet/sctp_module.c b/sys/netinet/sctp_module.c index ea49b74343e3..ba0d585bd541 100644 --- a/sys/netinet/sctp_module.c +++ b/sys/netinet/sctp_module.c @@ -1,187 +1,181 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019-2020 The FreeBSD Foundation * * This software was developed by Mark Johnston under sponsorship from * the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET extern struct domain inetdomain; struct protosw sctp_stream_protosw = { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs, }; struct protosw sctp_seqpacket_protosw = { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs, }; #endif #ifdef INET6 extern struct domain inet6domain; struct protosw sctp6_stream_protosw = { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs, }; struct protosw sctp6_seqpacket_protosw = { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, -#ifndef INET /* Do not call initialization and drain routines twice. 
*/ - .pr_drain = sctp_drain, -#endif .pr_usrreqs = &sctp6_usrreqs, }; #endif static int sctp_module_load(void) { int error; #ifdef INET error = pf_proto_register(PF_INET, &sctp_stream_protosw); if (error != 0) return (error); error = pf_proto_register(PF_INET, &sctp_seqpacket_protosw); if (error != 0) return (error); error = ipproto_register(IPPROTO_SCTP, sctp_input, sctp_ctlinput); if (error != 0) return (error); #endif #ifdef INET6 error = pf_proto_register(PF_INET6, &sctp6_stream_protosw); if (error != 0) return (error); error = pf_proto_register(PF_INET6, &sctp6_seqpacket_protosw); if (error != 0) return (error); error = ip6proto_register(IPPROTO_SCTP, sctp6_input, sctp6_ctlinput); if (error != 0) return (error); #endif error = sctp_syscalls_init(); if (error != 0) return (error); return (0); } static int __unused sctp_module_unload(void) { (void)sctp_syscalls_uninit(); #ifdef INET (void)ipproto_unregister(IPPROTO_SCTP); (void)pf_proto_unregister(PF_INET, IPPROTO_SCTP, SOCK_STREAM); (void)pf_proto_unregister(PF_INET, IPPROTO_SCTP, SOCK_SEQPACKET); #endif #ifdef INET6 (void)ip6proto_unregister(IPPROTO_SCTP); (void)pf_proto_unregister(PF_INET6, IPPROTO_SCTP, SOCK_STREAM); (void)pf_proto_unregister(PF_INET6, IPPROTO_SCTP, SOCK_SEQPACKET); #endif return (0); } static int sctp_modload(struct module *module, int cmd, void *arg) { int error; switch (cmd) { case MOD_LOAD: error = sctp_module_load(); break; case MOD_UNLOAD: /* * Unloading SCTP is currently unsupported. Currently, SCTP * iterator threads are not stopped during unload. 
*/ error = EOPNOTSUPP; break; default: error = 0; break; } return (error); } static moduledata_t sctp_mod = { "sctp", &sctp_modload, NULL, }; DECLARE_MODULE(sctp, sctp_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_VERSION(sctp, 1); diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index 73c550b86d65..0fb92e7408f4 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -1,7084 +1,7091 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET6 #include #endif #include #include #include /* FIX: we don't handle multiple link local scopes */ /* "scopeless" replacement IN6_ARE_ADDR_EQUAL */ #ifdef INET6 int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b) { struct sockaddr_in6 tmp_a, tmp_b; memcpy(&tmp_a, a, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } memcpy(&tmp_b, b, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr)); } #endif void sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb) { /* * We really don't need to lock this, but I will just because it * does not hurt. */ SCTP_INP_INFO_RLOCK(); spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep); spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc); spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr); spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr); spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk); spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq); spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq); spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks); SCTP_INP_INFO_RUNLOCK(); } /*- * Addresses are added to VRF's (Virtual Router's). 
For BSD we * have only the default VRF 0. We maintain a hash list of * VRF's. Each VRF has its own list of sctp_ifn's. Each of * these has a list of addresses. When we add a new address * to a VRF we lookup the ifn/ifn_index, if the ifn does * not exist we create it and add it to the list of IFN's * within the VRF. Once we have the sctp_ifn, we add the * address to the list. So we look something like: * * hash-vrf-table * vrf-> ifn-> ifn -> ifn * vrf | * ... +--ifa-> ifa -> ifa * vrf * * We keep these separate lists since the SCTP subsystem will * point to these from its source address selection nets structure. * When an address is deleted it does not happen right away on * the SCTP side, it gets scheduled. What we do when a * delete happens is immediately remove the address from * the master list and decrement the refcount. As our * addip iterator works through and frees the src address * selection pointing to the sctp_ifa, eventually the refcount * will reach 0 and we will delete it. Note that it is assumed * that any locking on system level ifn/ifa is done at the * caller of these functions and these routines will only * lock the SCTP structures as they add or delete things. * * Other notes on VRF concepts. * - An endpoint can be in multiple VRF's * - An association lives within a VRF and only one VRF. * - Any incoming packet we can deduce the VRF for by * looking at the mbuf/pak inbound (for BSD its VRF=0 :D) * - Any downward send call or connect call must supply the * VRF via ancillary data or via some sort of set default * VRF socket option call (again for BSD no brainer since * the VRF is always 0). * - An endpoint may add multiple VRF's to it. * - Listening sockets can accept associations in any * of the VRF's they are in but the assoc will end up * in only one VRF (gotten from the packet or connect/send). 
* */ struct sctp_vrf * sctp_allocate_vrf(int vrf_id) { struct sctp_vrf *vrf = NULL; struct sctp_vrflist *bucket; /* First allocate the VRF structure */ vrf = sctp_find_vrf(vrf_id); if (vrf) { /* Already allocated */ return (vrf); } SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf), SCTP_M_VRF); if (vrf == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif return (NULL); } /* setup the VRF */ memset(vrf, 0, sizeof(struct sctp_vrf)); vrf->vrf_id = vrf_id; LIST_INIT(&vrf->ifnlist); vrf->total_ifa_count = 0; vrf->refcount = 0; /* now also setup table ids */ SCTP_INIT_VRF_TABLEID(vrf); /* Init the HASH of addresses */ vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE, &vrf->vrf_addr_hashmark); if (vrf->vrf_addr_hash == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif SCTP_FREE(vrf, SCTP_M_VRF); return (NULL); } /* Add it to the hash table */ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_INSERT_HEAD(bucket, vrf, next_vrf); atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); return (vrf); } struct sctp_ifn * sctp_find_ifn(void *ifn, uint32_t ifn_index) { struct sctp_ifn *sctp_ifnp; struct sctp_ifnlist *hash_ifn_head; /* * We assume the lock is held for the addresses if that's wrong * problems could occur :-) */ SCTP_IPI_ADDR_LOCK_ASSERT(); hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { if (sctp_ifnp->ifn_index == ifn_index) { return (sctp_ifnp); } if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) { return (sctp_ifnp); } } return (NULL); } struct sctp_vrf * sctp_find_vrf(uint32_t vrf_id) { struct sctp_vrflist *bucket; struct sctp_vrf *liste; bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH(liste, bucket, next_vrf) { if (vrf_id == liste->vrf_id) { return (liste); } } return (NULL); } void 
sctp_free_vrf(struct sctp_vrf *vrf) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) { if (vrf->vrf_addr_hash) { SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); vrf->vrf_addr_hash = NULL; } /* We zero'd the count */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); } } void sctp_free_ifn(struct sctp_ifn *sctp_ifnp) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { /* We zero'd the count */ if (sctp_ifnp->vrf) { sctp_free_vrf(sctp_ifnp->vrf); } SCTP_FREE(sctp_ifnp, SCTP_M_IFN); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); } } void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu) { struct sctp_ifn *sctp_ifnp; sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index); if (sctp_ifnp != NULL) { sctp_ifnp->ifn_mtu = mtu; } } void sctp_free_ifa(struct sctp_ifa *sctp_ifap) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { /* We zero'd the count */ if (sctp_ifap->ifn_p) { sctp_free_ifn(sctp_ifap->ifn_p); } SCTP_FREE(sctp_ifap, SCTP_M_IFA); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); } } static void sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock) { struct sctp_ifn *found; found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index); if (found == NULL) { /* Not in the list.. 
sorry */ return; } if (hold_addr_lock == 0) { SCTP_IPI_ADDR_WLOCK(); } else { SCTP_IPI_ADDR_WLOCK_ASSERT(); } LIST_REMOVE(sctp_ifnp, next_bucket); LIST_REMOVE(sctp_ifnp, next_ifn); if (hold_addr_lock == 0) { SCTP_IPI_ADDR_WUNLOCK(); } /* Take away the reference, and possibly free it */ sctp_free_ifn(sctp_ifnp); } void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID); sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; out: SCTP_IPI_ADDR_RUNLOCK(); } void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no 
IFN - can't mark unusable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE); sctp_ifap->localifa_flags |= SCTP_ADDR_VALID; out: SCTP_IPI_ADDR_RUNLOCK(); } /*- * Add an ifa to an ifn. * Register the interface as necessary. * NOTE: ADDR write lock MUST be held. */ static void sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) { int ifa_af; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); /* update address counts */ sctp_ifnp->ifa_count++; ifa_af = sctp_ifap->address.sa.sa_family; switch (ifa_af) { #ifdef INET case AF_INET: sctp_ifnp->num_v4++; break; #endif #ifdef INET6 case AF_INET6: sctp_ifnp->num_v6++; break; #endif default: break; } if (sctp_ifnp->ifa_count == 1) { /* register the new interface */ sctp_ifnp->registered_af = ifa_af; } } /*- * Remove an ifa from its ifn. * If no more addresses exist, remove the ifn too. Otherwise, re-register * the interface based on the remaining address families left. * NOTE: ADDR write lock MUST be held. 
*/ static void sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) { LIST_REMOVE(sctp_ifap, next_ifa); if (sctp_ifap->ifn_p) { /* update address counts */ sctp_ifap->ifn_p->ifa_count--; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: sctp_ifap->ifn_p->num_v4--; break; #endif #ifdef INET6 case AF_INET6: sctp_ifap->ifn_p->num_v6--; break; #endif default: break; } if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { /* remove the ifn, possibly freeing it */ sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED); } else { /* re-register address family type, if needed */ if ((sctp_ifap->ifn_p->num_v6 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET6)) { sctp_ifap->ifn_p->registered_af = AF_INET; } else if ((sctp_ifap->ifn_p->num_v4 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET)) { sctp_ifap->ifn_p->registered_af = AF_INET6; } /* free the ifn refcount */ sctp_free_ifn(sctp_ifap->ifn_p); } sctp_ifap->ifn_p = NULL; } } struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, uint32_t ifn_type, const char *if_name, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add) { struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifnp, *new_sctp_ifnp; struct sctp_ifa *sctp_ifap, *new_sctp_ifap; struct sctp_ifalist *hash_addr_head; struct sctp_ifnlist *hash_ifn_head; uint32_t hash_of_addr; int new_ifn_af = 0; #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif SCTP_MALLOC(new_sctp_ifnp, struct sctp_ifn *, sizeof(struct sctp_ifn), SCTP_M_IFN); if (new_sctp_ifnp == NULL) { #ifdef INVARIANTS panic("No memory for IFN"); #endif return (NULL); } SCTP_MALLOC(new_sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA); if (new_sctp_ifap == NULL) { #ifdef INVARIANTS panic("No memory for IFA"); #endif SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); return (NULL); } SCTP_IPI_ADDR_WLOCK(); sctp_ifnp = sctp_find_ifn(ifn, ifn_index); if (sctp_ifnp) { vrf = 
sctp_ifnp->vrf; } else { vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { vrf = sctp_allocate_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_WUNLOCK(); SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (NULL); } } } if (sctp_ifnp == NULL) { /* * build one and add it, can't hold lock until after malloc * done though. */ sctp_ifnp = new_sctp_ifnp; new_sctp_ifnp = NULL; memset(sctp_ifnp, 0, sizeof(struct sctp_ifn)); sctp_ifnp->ifn_index = ifn_index; sctp_ifnp->ifn_p = ifn; sctp_ifnp->ifn_type = ifn_type; sctp_ifnp->refcount = 0; sctp_ifnp->vrf = vrf; atomic_add_int(&vrf->refcount, 1); sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index); if (if_name != NULL) { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", if_name); } else { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", "unknown"); } hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_INIT(&sctp_ifnp->ifalist); LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); new_ifn_af = 1; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Hmm, it already exists? 
*/ if ((sctp_ifap->ifn_p) && (sctp_ifap->ifn_p->ifn_index == ifn_index)) { SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", sctp_ifap->ifn_p->ifn_name, ifn_index, (void *)sctp_ifap); if (new_ifn_af) { /* Remove the created one that we don't want */ sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED); } if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { /* easy to solve, just switch back to active */ SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); sctp_ifap->localifa_flags = SCTP_ADDR_VALID; sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); } exit_stage_left: SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (sctp_ifap); } else { if (sctp_ifap->ifn_p) { /* * The last IFN gets the address, remove the * old one */ SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name, sctp_ifap->ifn_p->ifn_index, if_name, ifn_index); /* remove the address from the old ifn */ sctp_remove_ifa_from_ifn(sctp_ifap); /* move the address over to the new ifn */ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); goto exit_stage_left; } else { /* repair ifnp which was NULL ? 
*/ sctp_ifap->localifa_flags = SCTP_ADDR_VALID; SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", (void *)sctp_ifnp, (void *)sctp_ifap); sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } goto exit_stage_left; } } sctp_ifap = new_sctp_ifap; memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifnp->refcount, 1); sctp_ifap->vrf_id = vrf_id; sctp_ifap->ifa = ifa; memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &sctp_ifap->address.sin; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_loop = 1; } if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v4++; if (new_ifn_af) new_ifn_af = AF_INET; break; } #endif #ifdef INET6 case AF_INET6: { /* ok to use deprecated addresses? 
*/ struct sockaddr_in6 *sin6; sin6 = &sctp_ifap->address.sin6; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { sctp_ifap->src_is_loop = 1; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v6++; if (new_ifn_af) new_ifn_af = AF_INET6; break; } #endif default: new_ifn_af = 0; break; } hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); if ((sctp_ifap->src_is_priv == 0) && (sctp_ifap->src_is_loop == 0)) { sctp_ifap->src_is_glob = 1; } hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket); sctp_ifap->refcount = 1; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifnp->ifa_count++; vrf->total_ifa_count++; atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); if (new_ifn_af) { sctp_ifnp->registered_af = new_ifn_af; } SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } if (dynamic_add) { /* * Bump up the refcount so that when the timer completes it * will drop back down. */ struct sctp_laddr *wi; atomic_add_int(&sctp_ifap->refcount, 1); wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? 
*/ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Opps, must decrement the count */ sctp_del_addr_from_vrf(vrf_id, addr, ifn_index, if_name); return (NULL); } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_ADD_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } else { /* it's ready for use */ sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } return (sctp_ifap); } void sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, uint32_t ifn_index, const char *if_name) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap = NULL; SCTP_IPI_ADDR_WLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out_now; } #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Validate the delete */ if (sctp_ifap->ifn_p) { int valid = 0; /*- * The name has priority over the ifn_index * if its given. */ if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) == 0) { /* They match its a correct delete */ valid = 1; } } if (!valid) { /* last ditch check ifn_index */ if (ifn_index == sctp_ifap->ifn_p->ifn_index) { valid = 1; } } if (!valid) { SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n", ifn_index, ((if_name == NULL) ? 
"NULL" : if_name)); SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n", sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name); SCTP_IPI_ADDR_WUNLOCK(); return; } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until * they close. This gives us TCP like behavior if someone * removes an address (or for that matter adds it right * back). */ /* sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; */ vrf->total_ifa_count--; LIST_REMOVE(sctp_ifap, next_bucket); sctp_remove_ifa_from_ifn(sctp_ifap); } #ifdef SCTP_DEBUG else { SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", ifn_index); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif out_now: SCTP_IPI_ADDR_WUNLOCK(); if (sctp_ifap) { struct sctp_laddr *wi; wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Oops, must decrement the count */ sctp_free_ifa(sctp_ifap); return; } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_DEL_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); /* * Should this really be a tailq? 
As it is we will process * the newest first :-0 */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } return; } static int sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { /* no vrf, no addresses */ SCTP_IPI_ADDR_RUNLOCK(); return (0); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) { /* * We allow pending addresses, where * we have sent an asconf-add to be * considered valid. 
*/ continue; } if (sctp_ifa->address.sa.sa_family != to->sa_family) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin, *rsin; sin = &sctp_ifa->address.sin; rsin = (struct sockaddr_in *)to; if ((ipv4_local_scope == 0) && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6, *rsin6; sin6 = &sctp_ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) { /* * We allow pending addresses, where we have * sent an asconf-add to be considered * valid. 
*/ continue; } if (laddr->ifa->address.sa.sa_family != to->sa_family) { continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = &laddr->ifa->address.sin; rsin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = &laddr->ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif default: /* TSNH */ break; } } } SCTP_IPI_ADDR_RUNLOCK(); return (0); } static struct sctp_tcb * sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id) { /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */ /* * If we support the TCP model, then we must now dig through to see * if we can find our endpoint in the list of tcp ep's. */ uint16_t lport, rport; struct sctppcbhead *ephead; struct sctp_inpcb *inp; struct sctp_laddr *laddr; struct sctp_tcb *stcb; struct sctp_nets *net; if ((to == NULL) || (from == NULL)) { return (NULL); } switch (to->sa_family) { #ifdef INET case AF_INET: if (from->sa_family == AF_INET) { lport = ((struct sockaddr_in *)to)->sin_port; rport = ((struct sockaddr_in *)from)->sin_port; } else { return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (from->sa_family == AF_INET6) { lport = ((struct sockaddr_in6 *)to)->sin6_port; rport = ((struct sockaddr_in6 *)from)->sin6_port; } else { return (NULL); } break; #endif default: return (NULL); } ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; /* * Ok now for each of the guys in this bucket we must look and see: * - Does the remote port match. - Does there single association's * addresses match this address (to). If so we update p_ep to point * to this ep and return the tcb from it. 
*/ LIST_FOREACH(inp, ephead, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if (lport != inp->sctp_lport) { SCTP_INP_RUNLOCK(inp); continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)to; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif default: SCTP_INP_RUNLOCK(inp); continue; } if (inp->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(inp); continue; } /* check to see if the ep has one of the addresses */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* We are NOT bound all, so look further */ int match = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == to->sa_family) { /* see if it matches */ #ifdef INET if (from->sa_family == AF_INET) { struct sockaddr_in *intf_addr, *sin; intf_addr = &laddr->ifa->address.sin; sin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == intf_addr->sin_addr.s_addr) { match = 1; break; } } #endif #ifdef INET6 if (from->sa_family == AF_INET6) { struct sockaddr_in6 *intf_addr6; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *) to; intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { match = 1; break; } } #endif } } if (match == 0) { /* This endpoint does not have this address */ SCTP_INP_RUNLOCK(inp); continue; } } /* * Ok if we hit here the ep has the address, does it hold * the tcb? 
*/ /* XXX: Why don't we TAILQ_FOREACH through sctp_asoc_list? */ stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); continue; } SCTP_TCB_LOCK(stcb); if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } /* Does this TCB have a matching address? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family != from->sa_family) { /* not the same family, can't be a match */ continue; } switch (from->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *)&net->ro._l_addr; rsin = (struct sockaddr_in *)from; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)from; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); } return (NULL); } /* * rules for use * * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an * stcb, both will be locked (locked_tcb and stcb) but decrement will be done * (if locked == NULL). 3) Decrement happens on return ONLY if locked == * NULL. 
*/ struct sctp_tcb * sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb) { struct sctpasochead *head; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct sctp_nets *net; uint16_t rport; inp = *inp_p; switch (remote->sa_family) { #ifdef INET case AF_INET: rport = (((struct sockaddr_in *)remote)->sin_port); break; #endif #ifdef INET6 case AF_INET6: rport = (((struct sockaddr_in6 *)remote)->sin6_port); break; #endif default: return (NULL); } if (locked_tcb) { /* * UN-lock so we can do proper locking here this occurs when * called from load_addresses_from_init. */ atomic_add_int(&locked_tcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(locked_tcb); } SCTP_INP_INFO_RLOCK(); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /*- * Now either this guy is our listener or it's the * connector. If it is the one that issued the connect, then * it's only chance is to be the first TCB in the list. If * it is the acceptor, then do the special_lookup to hash * and find the real inp. */ if ((inp->sctp_socket) && SCTP_IS_LISTENING(inp)) { /* to is peer addr, from is my addr */ stcb = sctp_tcb_special_locate(inp_p, remote, local, netp, inp->def_vrf_id); if ((stcb != NULL) && (locked_tcb == NULL)) { /* we have a locked tcb, lower refcount */ SCTP_INP_DECR_REF(inp); } if ((locked_tcb != NULL) && (locked_tcb != stcb)) { SCTP_INP_RLOCK(locked_tcb->sctp_ep); SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); SCTP_INP_RUNLOCK(locked_tcb->sctp_ep); } SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto null_return; } SCTP_TCB_LOCK(stcb); if (stcb->rport != rport) { /* remote port does not match. 
*/ SCTP_TCB_UNLOCK(stcb); goto null_return; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); goto null_return; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); goto null_return; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, inp->sctp_hashmark)]; LIST_FOREACH(stcb, head, sctp_tcbhash) { if (stcb->rport != rport) { /* remote port does not match */ continue; } SCTP_TCB_LOCK(stcb); if (stcb->asoc.state & 
SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); continue; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *) &net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } null_return: /* clean up for returning null */ if (locked_tcb) { SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); /* not found */ return (NULL); } /* * Find an association for a specific endpoint using the association id given * out in the COMM_UP notification */ struct sctp_tcb * sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int 
want_lock) { /* * Use my the assoc_id to find a endpoint */ struct sctpasochead *head; struct sctp_tcb *stcb; uint32_t id; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_PRINTF("TSNH ep_associd0\n"); return (NULL); } id = (uint32_t)asoc_id; head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; if (head == NULL) { /* invalid id TSNH */ SCTP_PRINTF("TSNH ep_associd1\n"); return (NULL); } LIST_FOREACH(stcb, head, sctp_tcbasocidhash) { if (stcb->asoc.assoc_id == id) { if (inp != stcb->sctp_ep) { /* * some other guy has the same id active (id * collision ??). */ SCTP_PRINTF("TSNH ep_associd2\n"); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { continue; } if (want_lock) { SCTP_TCB_LOCK(stcb); } return (stcb); } } return (NULL); } struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { struct sctp_tcb *stcb; SCTP_INP_RLOCK(inp); stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock); SCTP_INP_RUNLOCK(inp); return (stcb); } /* * Endpoint probe expects that the INP_INFO is locked. 
*/ static struct sctp_inpcb * sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, uint16_t lport, uint32_t vrf_id) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sockaddr_in6 *intf_addr6; #endif int fnd; #ifdef INET sin = NULL; #endif #ifdef INET6 sin6 = NULL; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; break; #endif default: /* unsupported family */ return (NULL); } if (head == NULL) return (NULL); LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) && (inp->sctp_lport == lport)) { /* got it */ switch (nam->sa_family) { #ifdef INET case AF_INET: if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* * IPv4 on a IPv6 socket with ONLY * IPv6 set */ SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif #ifdef INET6 case AF_INET6: /* * A V6 address and the endpoint is NOT * bound V6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif default: break; } /* does a VRF id match? 
*/ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; SCTP_INP_RUNLOCK(inp); if (!fnd) continue; return (inp); } SCTP_INP_RUNLOCK(inp); } switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == INADDR_ANY) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif default: break; } /* * ok, not bound to all so see if we can find a EP bound to this * address. */ LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) { SCTP_INP_RUNLOCK(inp); continue; } /* * Ok this could be a likely candidate, look at all of its * addresses */ if (inp->sctp_lport != lport) { SCTP_INP_RUNLOCK(inp); continue; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) { SCTP_INP_RUNLOCK(inp); continue; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", (void *)laddr->ifa); if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == nam->sa_family) { /* possible, see if it matches */ switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == laddr->ifa->address.sin.sin_addr.s_addr) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif #ifdef INET6 case AF_INET6: intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif } } } SCTP_INP_RUNLOCK(inp); } return (NULL); } static struct sctp_inpcb * sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, 
uint32_t vrf_id) { struct sctppcbhead *head; struct sctp_inpcb *t_inp; int fnd; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; LIST_FOREACH(t_inp, head, sctp_hash) { if (t_inp->sctp_lport != lport) { continue; } /* is it in the VRF in question */ fnd = 0; if (t_inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) continue; /* This one is in use. */ /* check the v6/v4 binding issue */ if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(t_inp)) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* collision in V6 space */ return (t_inp); } else { /* inp is BOUND_V4 no conflict */ continue; } } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* t_inp is bound v4 and v6, conflict always */ return (t_inp); } else { /* t_inp is bound only V4 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* no conflict */ continue; } /* else fall through to conflict */ } return (t_inp); } return (NULL); } int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) { /* For 1-2-1 with port reuse */ struct sctppcbhead *head; struct sctp_inpcb *tinp, *ninp; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* only works with port reuse on */ return (-1); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) { return (0); } SCTP_INP_WUNLOCK(inp); head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; /* Kick out all non-listeners to the TCP hash */ LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) { if (tinp->sctp_lport != inp->sctp_lport) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { continue; } if (SCTP_IS_LISTENING(tinp)) { continue; } SCTP_INP_WLOCK(tinp); LIST_REMOVE(tinp, sctp_hash); head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))]; 
tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; LIST_INSERT_HEAD(head, tinp, sctp_hash); SCTP_INP_WUNLOCK(tinp); } SCTP_INP_WLOCK(inp); /* Pull from where he was */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; LIST_INSERT_HEAD(head, inp, sctp_hash); return (0); } struct sctp_inpcb * sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock, uint32_t vrf_id) { /* * First we check the hash table to see if someone has this port * bound with just the port. */ struct sctp_inpcb *inp; struct sctppcbhead *head; int lport; unsigned int i; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; lport = sin->sin_port; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; lport = sin6->sin6_port; break; #endif default: return (NULL); } /* * I could cheat here and just cast to one of the types but we will * do it right. It also provides the check against an Unsupported * type too. */ /* Find the head of the ALLADDR chain */ if (have_lock == 0) { SCTP_INP_INFO_RLOCK(); } head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); /* * If the TCP model exists it could be that the main listening * endpoint is gone but there still exists a connected socket for * this guy. If so we can return the first one that we find. This * may NOT be the correct one so the caller should be wary on the * returned INP. Currently the only caller that sets find_tcp_pool * is in bindx where we are verifying that a user CAN bind the * address. He either has bound it already, or someone else has, or * its open to bind, so this is good enough. 
*/ if (inp == NULL && find_tcp_pool) { for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) { head = &SCTP_BASE_INFO(sctp_tcpephash)[i]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); if (inp) { break; } } } if (inp) { SCTP_INP_INCR_REF(inp); } if (have_lock == 0) { SCTP_INP_INFO_RUNLOCK(); } return (inp); } /* * Find an association for an endpoint with the pointer to whom you want to * send to and the endpoint pointer. The address can be IPv4 or IPv6. We may * need to change the *to to some other struct like a mbuf... */ struct sctp_tcb * sctp_findassociation_addr_sa(struct sockaddr *from, struct sockaddr *to, struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool, uint32_t vrf_id) { struct sctp_inpcb *inp = NULL; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); if (find_tcp_pool) { if (inp_p != NULL) { stcb = sctp_tcb_special_locate(inp_p, from, to, netp, vrf_id); } else { stcb = sctp_tcb_special_locate(&inp, from, to, netp, vrf_id); } if (stcb != NULL) { SCTP_INP_INFO_RUNLOCK(); return (stcb); } } inp = sctp_pcb_findep(to, 0, 1, vrf_id); if (inp_p != NULL) { *inp_p = inp; } SCTP_INP_INFO_RUNLOCK(); if (inp == NULL) { return (NULL); } /* * ok, we have an endpoint, now lets find the assoc for it (if any) * we now place the source address or from in the to of the find * endpoint call. Since in reality this chain is used from the * inbound packet side. */ if (inp_p != NULL) { stcb = sctp_findassociation_ep_addr(inp_p, from, netp, to, NULL); } else { stcb = sctp_findassociation_ep_addr(&inp, from, netp, to, NULL); } return (stcb); } /* * This routine will grub through the mbuf that is a INIT or INIT-ACK and * find all addresses that the sender has specified in any address list. Each * address will be used to lookup the TCB and see if one exits. 
*/ static struct sctp_tcb * sctp_findassociation_special_addr(struct mbuf *m, int offset, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, struct sockaddr *dst) { struct sctp_paramhdr *phdr, param_buf; #if defined(INET) || defined(INET6) struct sctp_tcb *stcb; uint16_t ptype; #endif uint16_t plen; #ifdef INET struct sockaddr_in sin4; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_len = sizeof(sin4); sin4.sin_family = AF_INET; sin4.sin_port = sh->src_port; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_port = sh->src_port; #endif offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr != NULL) { /* now we must see if we want the parameter */ #if defined(INET) || defined(INET6) ptype = ntohs(phdr->param_type); #endif plen = ntohs(phdr->param_length); if (plen == 0) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS && plen == sizeof(struct sctp_ipv4addr_param)) { /* Get the rest of the address */ struct sctp_ipv4addr_param ip4_param, *p4; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip4_param, sizeof(ip4_param)); if (phdr == NULL) { return (NULL); } p4 = (struct sctp_ipv4addr_param *)phdr; memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin4, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS && plen == sizeof(struct sctp_ipv6addr_param)) { /* Get the rest of the address */ struct sctp_ipv6addr_param ip6_param, *p6; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip6_param, sizeof(ip6_param)); if (phdr == NULL) { return (NULL); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr)); /* look it up */ stcb = 
sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin6, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } return (NULL); } static struct sctp_tcb * sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport, uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag) { /* * Use my vtag to hash. If we find it we then verify the source addr * is in the assoc. If all goes well we save a bit on rec of a * packet. */ struct sctpasochead *head; struct sctp_nets *net; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { SCTP_INP_RLOCK(stcb->sctp_ep); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } if (stcb->sctp_ep->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(stcb->sctp_ep); if (stcb->asoc.my_vtag == vtag) { /* candidate */ if (stcb->rport != rport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->sctp_ep->sctp_lport != lport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } /* RRS:Need toaddr check here */ if (sctp_does_stcb_own_this_addr(stcb, to) == 0) { /* Endpoint does not own this address */ SCTP_TCB_UNLOCK(stcb); continue; } if (remote_tag) { /* * If we have both vtags that's all we match * on */ if (stcb->asoc.peer_vtag == remote_tag) { /* * If both tags match we consider it * conclusive and check NO * source/destination addresses */ goto conclusive; } } if (skip_src_check) { conclusive: if (from) { *netp = sctp_findnet(stcb, from); } else { *netp = NULL; /* unknown */ } if (inp_p) *inp_p = stcb->sctp_ep; 
SCTP_INP_INFO_RUNLOCK(); return (stcb); } net = sctp_findnet(stcb, from); if (net) { /* yep its him. */ *netp = net; SCTP_STAT_INCR(sctps_vtagexpress); *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { /* * not him, this should only happen in rare * cases so I peg it. */ SCTP_STAT_INCR(sctps_vtagbogus); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_INFO_RUNLOCK(); return (NULL); } /* * Find an association with the pointer to the inbound IP packet. This can be * a IPv4 or IPv6 packet. */ struct sctp_tcb * sctp_findassociation_addr(struct mbuf *m, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; struct sctp_inpcb *inp; if (sh->v_tag) { /* we only go down this path if vtag is non-zero */ stcb = sctp_findassoc_by_vtag(src, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0); if (stcb) { return (stcb); } } if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, 1, vrf_id); inp = *inp_p; } else { stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp, 1, vrf_id); } SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp); if (stcb == NULL && inp) { /* Found a EP but not this address */ if ((ch->chunk_type == SCTP_INITIATION) || (ch->chunk_type == SCTP_INITIATION_ACK)) { /*- * special hook, we do NOT return linp or an * association that is linked to an existing * association that is under the TCP pool (i.e. no * listener exists). The endpoint finding routine * will always find a listener before examining the * TCP pool. 
*/ if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) { if (inp_p) { *inp_p = NULL; } return (NULL); } stcb = sctp_findassociation_special_addr(m, offset, sh, &inp, netp, dst); if (inp_p != NULL) { *inp_p = inp; } } } SCTPDBG(SCTP_DEBUG_PCB1, "stcb is %p\n", (void *)stcb); return (stcb); } /* * lookup an association by an ASCONF lookup address. * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup */ struct sctp_tcb * sctp_findassociation_ep_asconf(struct mbuf *m, int offset, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; union sctp_sockstore remote_store; struct sctp_paramhdr param_buf, *phdr; int ptype; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif memset(&remote_store, 0, sizeof(remote_store)); phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), ¶m_buf, sizeof(struct sctp_paramhdr)); if (phdr == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", __func__); return NULL; } ptype = (int)((uint32_t)ntohs(phdr->param_type)); /* get the correlation address */ switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { /* ipv6 address param */ struct sctp_ipv6addr_param *p6, p6_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) { return NULL; } p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p6_buf.ph, sizeof(p6_buf)); if (p6 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", __func__); return (NULL); } sin6 = &remote_store.sin6; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = sh->src_port; memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { /* ipv4 address param */ struct sctp_ipv4addr_param 
*p4, p4_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) { return NULL; } p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p4_buf.ph, sizeof(p4_buf)); if (p4 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", __func__); return (NULL); } sin = &remote_store.sin; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = sh->src_port; memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr)); if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; break; } #endif default: /* invalid address param type */ return NULL; } if (zero_address) { stcb = sctp_findassoc_by_vtag(NULL, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 1, vrf_id, 0); if (stcb != NULL) { SCTP_INP_DECR_REF(*inp_p); } } else { stcb = sctp_findassociation_ep_addr(inp_p, &remote_store.sa, netp, dst, NULL); } return (stcb); } /* * allocate a sctp_inpcb and setup a temporary binding to a port/all * addresses. This way if we don't get a bind we by default pick a ephemeral * port with all addresses bound. */ int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) { /* * we get called when a new endpoint starts up. We need to allocate * the sctp_inpcb structure from the zone and init it. Mark it as * unbound and find a port that we can use as an ephemeral with * INADDR_ANY. If the user binds later no problem we can then add in * the specific addresses. And setup the default parameters for the * EP. 
*/ int i, error; struct sctp_inpcb *inp; struct sctp_pcb *m; struct timeval time; sctp_sharedkey_t *null_key; error = 0; SCTP_INP_INFO_WLOCK(); inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb); if (inp == NULL) { SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n"); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); return (ENOBUFS); } /* zap it */ memset(inp, 0, sizeof(*inp)); /* bump generations */ /* setup socket pointers */ inp->sctp_socket = so; inp->ip_inp.inp.inp_socket = so; inp->ip_inp.inp.inp_cred = crhold(so->so_cred); #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (MODULE_GLOBAL(ip6_auto_flowlabel)) { inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; } if (MODULE_GLOBAL(ip6_v6only)) { inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY; } } #endif inp->sctp_associd_counter = 1; inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; inp->sctp_frag_point = 0; inp->max_cwnd = 0; inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); inp->ecn_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_ecn_enable); inp->prsctp_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pr_enable); inp->auth_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_auth_enable); inp->asconf_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_asconf_enable); inp->reconfig_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_reconfig_enable); inp->nrsack_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_nrsack_enable); inp->pktdrop_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pktdrop_enable); inp->idata_supported = 0; inp->fibnum = so->so_fibnum; /* init the small hash table we use to track asocid <-> tcb */ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); if (inp->sctp_asocidhash == NULL) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_INP_INFO_WUNLOCK(); return (ENOBUFS); } SCTP_INCR_EP_COUNT(); inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); 
SCTP_INP_INFO_WUNLOCK(); so->so_pcb = (caddr_t)inp; if (SCTP_SO_TYPE(so) == SOCK_SEQPACKET) { /* UDP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure it is NON-BLOCKING IO for UDP */ /* SCTP_SET_SO_NBIO(so); */ } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) { /* TCP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure we have blocking IO by default */ SOCK_LOCK(so); SCTP_CLEAR_SO_NBIO(so); SOCK_UNLOCK(so); } else { /* * unsupported socket type (RAW, etc)- in case we missed it * in protosw */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (EOPNOTSUPP); } if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize), &inp->sctp_hashmark); if (inp->sctp_tcbhash == NULL) { SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (ENOBUFS); } inp->def_vrf_id = vrf_id; SCTP_INP_INFO_WLOCK(); SCTP_INP_LOCK_INIT(inp); rw_init_flags(&inp->ip_inp.inp.inp_lock, "sctpinp", RW_RECURSE | RW_DUPOK); SCTP_INP_READ_INIT(inp); SCTP_ASOC_CREATE_LOCK_INIT(inp); /* lock the new ep */ SCTP_INP_WLOCK(inp); /* add it to the info 
area */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list); SCTP_INP_INFO_WUNLOCK(); TAILQ_INIT(&inp->read_queue); LIST_INIT(&inp->sctp_addr_list); LIST_INIT(&inp->sctp_asoc_list); #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_INIT(&inp->sctp_asoc_free_list); #endif /* Init the timer structure for signature change */ SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer); inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE; /* now init the actual endpoint default data */ m = &inp->sctp_ep; /* setup the base timeout information */ m->sctp_timeoutticks[SCTP_TIMER_SEND] = sctp_secs_to_ticks(SCTP_SEND_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_INIT] = sctp_secs_to_ticks(SCTP_INIT_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_RECV] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default)); m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default)); m->sctp_timeoutticks[SCTP_TIMER_PMTU] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default)); m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default)); m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default)); /* all max/min max are in ms */ m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default); m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default); m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); m->def_net_pf_threshold = SCTP_BASE_SYSCTL(sctp_path_pf_threshold); m->sctp_sws_sender = SCTP_SWS_SENDER_DEF; m->sctp_sws_receiver = 
SCTP_SWS_RECEIVER_DEF; m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default); m->fr_max_burst = SCTP_BASE_SYSCTL(sctp_fr_max_burst_default); m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); m->default_mtu = 0; /* Add adaptation cookie */ m->adaptation_layer_indicator = 0; m->adaptation_layer_indicator_provided = 0; /* seed random number generator */ m->random_counter = 1; m->store_at = SCTP_SIGNATURE_SIZE; SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers)); sctp_fill_random_store(m); /* Minimum cookie size */ m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) + sizeof(struct sctp_state_cookie); m->size_of_a_cookie += SCTP_SIGNATURE_SIZE; /* Setup the initial secret */ (void)SCTP_GETTIME_TIMEVAL(&time); m->time_of_secret_change = (unsigned int)time.tv_sec; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { m->secret_key[0][i] = sctp_select_initial_TSN(m); } sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); /* How long is a cookie good for ? 
*/ m->def_cookie_life = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default)); /* * Initialize authentication parameters */ m->local_hmacs = sctp_default_supported_hmaclist(); m->local_auth_chunks = sctp_alloc_chunklist(); if (inp->asconf_supported) { sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks); } m->default_dscp = 0; #ifdef INET6 m->default_flowlabel = 0; #endif m->port = 0; /* encapsulation disabled by default */ LIST_INIT(&m->shared_keys); /* add default NULL key as key id 0 */ null_key = sctp_alloc_sharedkey(); sctp_insert_sharedkey(&m->shared_keys, null_key); SCTP_INP_WUNLOCK(inp); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 12); #endif return (error); } void sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb) { struct sctp_nets *net; uint16_t lport, rport; struct sctppcbhead *head; struct sctp_laddr *laddr, *oladdr; atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(old_inp); SCTP_INP_WLOCK(new_inp); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #ifdef INET6 if (old_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { new_inp->ip_inp.inp.inp_flags |= old_inp->ip_inp.inp.inp_flags & INP_CONTROLOPTS; if (old_inp->ip_inp.inp.in6p_outputopts) { new_inp->ip_inp.inp.in6p_outputopts = ip6_copypktopts(old_inp->ip_inp.inp.in6p_outputopts, M_NOWAIT); } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { new_inp->ip_inp.inp.inp_ip_tos = old_inp->ip_inp.inp.inp_ip_tos; new_inp->ip_inp.inp.inp_ip_ttl = old_inp->ip_inp.inp.inp_ip_ttl; } #endif new_inp->sctp_ep.time_of_secret_change = old_inp->sctp_ep.time_of_secret_change; memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key, sizeof(old_inp->sctp_ep.secret_key)); new_inp->sctp_ep.current_secret_number = old_inp->sctp_ep.current_secret_number; new_inp->sctp_ep.last_secret_number = 
old_inp->sctp_ep.last_secret_number; new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie; /* make it so new data pours into the new socket */ stcb->sctp_socket = new_inp->sctp_socket; stcb->sctp_ep = new_inp; /* Copy the port across */ lport = new_inp->sctp_lport = old_inp->sctp_lport; rport = stcb->rport; /* Pull the tcb from the old association */ LIST_REMOVE(stcb, sctp_tcbhash); LIST_REMOVE(stcb, sctp_tcblist); if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } /* Now insert the new_inp into the TCP connected hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; LIST_INSERT_HEAD(head, new_inp, sctp_hash); /* Its safe to access */ new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; /* Now move the tcb into the endpoint list */ LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist); /* * Question, do we even need to worry about the ep-hash since we * only have one connection? Probably not :> so lets get rid of it * and not suck up any kernel memory in that. */ if (stcb->asoc.in_asocid_hash) { struct sctpasochead *lhd; lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id, new_inp->hashasocidmark)]; LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash); } /* Ok. Let's restart timer. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp, stcb, net); } SCTP_INP_INFO_WUNLOCK(); if (new_inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark); new_inp->sctp_tcbhash = NULL; } if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* Subset bound, so copy in the laddr list from the old_inp */ LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) { laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* * Gak, what can we do? This assoc is really * HOSED. We probably should send an abort * here. 
*/ SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n"); continue; } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = oladdr->ifa; atomic_add_int(&laddr->ifa->refcount, 1); LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr, sctp_nxt_addr); new_inp->laddr_count++; if (oladdr == stcb->asoc.last_used_address) { stcb->asoc.last_used_address = laddr; } } } /* Now any running timers need to be adjusted. */ if (stcb->asoc.dack_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.dack_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.asconf_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.asconf_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.strreset_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.strreset_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.shut_guard_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.shut_guard_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.autoclose_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.autoclose_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.delete_prim_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.delete_prim_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } /* now what about the nets? 
*/ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->pmtu_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->pmtu_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->hb_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->hb_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->rxt_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->rxt_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } } SCTP_INP_WUNLOCK(new_inp); SCTP_INP_WUNLOCK(old_inp); } /* * insert an laddr entry with the given ifa for the desired list */ static int sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) { struct sctp_laddr *laddr; laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = ifa; laddr->action = act; atomic_add_int(&ifa->refcount, 1); /* insert it */ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); return (0); } /* * Remove an laddr entry from the local address list (on an assoc) */ static void sctp_remove_laddr(struct sctp_laddr *laddr) { /* remove from the list */ LIST_REMOVE(laddr, sctp_nxt_addr); sctp_free_ifa(laddr->ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); SCTP_DECR_LADDR_COUNT(); } /* * Bind the socket, with the PCB and global info locks held. Note, if a * socket address is specified, the PCB lock may be dropped and re-acquired. * * sctp_ifap is used to bypass normal local address validation checks. 
*/ int sctp_inpcb_bind_locked(struct sctp_inpcb *inp, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *td) { /* bind a ep to a socket address */ struct sctppcbhead *head; struct sctp_inpcb *inp_tmp; struct inpcb *ip_inp; int port_reuse_active = 0; int bindall; uint16_t lport; int error; uint32_t vrf_id; KASSERT(td != NULL, ("%s: null thread", __func__)); error = 0; lport = 0; bindall = 1; ip_inp = &inp->ip_inp.inp; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); #ifdef SCTP_DEBUG if (addr) { SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port: %d\n", ntohs(((struct sockaddr_in *)addr)->sin_port)); SCTPDBG(SCTP_DEBUG_PCB1, "Addr: "); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { error = EINVAL; /* already did a bind, subsequent binds NOT allowed ! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (addr != NULL) { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* IPV6_V6ONLY socket? */ if (SCTP_IPV6_V6ONLY(inp)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (addr->sa_len != sizeof(*sin)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } sin = (struct sockaddr_in *)addr; lport = sin->sin_port; /* * For LOOPBACK the prison_local_ip4() call * will transmute the ip address to the * proper value. */ if ((error = prison_local_ip4(td->td_ucred, &sin->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (sin->sin_addr.s_addr != INADDR_ANY) { bindall = 0; } break; } #endif #ifdef INET6 case AF_INET6: { /* * Only for pure IPv6 Address. (No IPv4 * Mapped!) 
*/ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (addr->sa_len != sizeof(*sin6)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } lport = sin6->sin6_port; /* * For LOOPBACK the prison_local_ip6() call * will transmute the ipv6 address to the * proper value. */ if ((error = prison_local_ip6(td->td_ucred, &sin6->sin6_addr, (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { bindall = 0; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; break; } #endif default: error = EAFNOSUPPORT; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } /* Setup a vrf_id to be the default for the non-bind-all case. */ vrf_id = inp->def_vrf_id; if (lport) { /* * Did the caller specify a port? if so we must see if an ep * already has this one bound. */ /* got to be root to get at low ports */ if (ntohs(lport) < IPPORT_RESERVED && (error = priv_check(td, PRIV_NETINET_RESERVEDPORT)) != 0) { goto out; } SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); if (bindall) { vrf_id = inp->def_vrf_id; inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. 
*/ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } else { inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } continue_anyway: SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); if (bindall) { /* verify that no lport is not used by a singleton */ if ((port_reuse_active == 0) && (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))) { /* Sorry someone already has this one bound */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { port_reuse_active = 1; } else { error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } } } else { uint16_t first, last, candidate; uint16_t count; if (ip_inp->inp_flags & INP_HIGHPORT) { first = MODULE_GLOBAL(ipport_hifirstauto); last = MODULE_GLOBAL(ipport_hilastauto); } else if (ip_inp->inp_flags & INP_LOWPORT) { if ((error = priv_check(td, PRIV_NETINET_RESERVEDPORT)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } first = 
MODULE_GLOBAL(ipport_lowfirstauto); last = MODULE_GLOBAL(ipport_lowlastauto); } else { first = MODULE_GLOBAL(ipport_firstauto); last = MODULE_GLOBAL(ipport_lastauto); } if (first > last) { uint16_t temp; temp = first; first = last; last = temp; } count = last - first + 1; /* number of candidates */ candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count); for (;;) { if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) { lport = htons(candidate); break; } if (--count == 0) { error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (candidate == last) candidate = first; else candidate = candidate + 1; } } if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* * this really should not happen. The guy did a non-blocking * bind and then did a close at the same time. */ error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } /* ok we look clear to give out this port, so lets setup the binding */ if (bindall) { /* binding to all addresses, so just set in the proper flags */ inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL; /* set the automatic addr changes from kernel flag */ if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } /* * set the automatic mobility_base from kernel flag (by * micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, 
SCTP_MOBILITY_PRIM_DELETED); } /* * set the automatic mobility_fasthandoff from kernel flag * (by micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } } else { /* * bind specific, make sure flags is off and add a new * address structure to the sctp_addr_list inside the ep * structure. * * We will need to allocate one and insert it at the head. * The socketopt call can just insert new addresses in there * as well. It will also have to do the embed scope kame * hack too (before adding). */ struct sctp_ifa *ifa; union sctp_sockstore store; memset(&store, 0, sizeof(store)); switch (addr->sa_family) { #ifdef INET case AF_INET: memcpy(&store.sin, addr, sizeof(struct sockaddr_in)); store.sin.sin_port = 0; break; #endif #ifdef INET6 case AF_INET6: memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6)); store.sin6.sin6_port = 0; break; #endif default: break; } /* * first find the interface with the bound address need to * zero out the port to find the address! yuck! can't do * this earlier since need port for sctp_pcb_findep() */ if (sctp_ifap != NULL) { ifa = sctp_ifap; } else { /* * Note for BSD we hit here always other O/S's will * pass things in via the sctp_ifap argument. */ ifa = sctp_find_ifa_by_addr(&store.sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } if (ifa == NULL) { error = EADDRNOTAVAIL; /* Can't find an interface with that address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } #ifdef INET6 if (addr->sa_family == AF_INET6) { /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. 
*/ error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } #endif /* we're not bound all */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL; /* allow bindx() to send ASCONF's for binding changes */ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); /* clear automatic addr changes from kernel flag */ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); /* add this address to the endpoint list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0); if (error != 0) goto out; inp->laddr_count++; } /* find the bucket */ if (port_reuse_active) { /* Put it into tcp 1-2-1 hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))]; inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; } else { head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; } /* put it in the bucket */ LIST_INSERT_HEAD(head, inp, sctp_hash); SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n", (void *)head, ntohs(lport), port_reuse_active); /* set in the port */ inp->sctp_lport = lport; /* turn off just the unbound flag */ KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) != 0, ("%s: inp %p is already bound", __func__, inp)); inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; out: return (error); } int sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *td) { struct sctp_inpcb *inp; int error; inp = so->so_pcb; SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); error = sctp_inpcb_bind_locked(inp, addr, sctp_ifap, td); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } static void sctp_iterator_inp_being_freed(struct sctp_inpcb *inp) { struct sctp_iterator *it, *nit; /* * We enter with the only the ITERATOR_LOCK in place and a write * lock on the inp_info stuff. 
*/ it = sctp_it_ctl.cur_it; if (it && (it->vn != curvnet)) { /* Its not looking at our VNET */ return; } if (it && (it->inp == inp)) { /* * This is tricky and we hold the iterator lock, but when it * returns and gets the lock (when we release it) the * iterator will try to operate on inp. We need to stop that * from happening. But of course the iterator has a * reference on the stcb and inp. We can mark it and it will * stop. * * If its a single iterator situation, we set the end * iterator flag. Otherwise we set the iterator to go to the * next inp. * */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } else { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP; } } /* * Now go through and remove any single reference to our inp that * may be still pending on the list */ SCTP_IPI_ITERATOR_WQ_LOCK(); TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } if (it->inp == inp) { /* This one points to me is it inp specific? */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { /* Remove and free this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } else { it->inp = LIST_NEXT(it->inp, sctp_list); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } } /* * When its put in the refcnt is incremented so decr * it */ SCTP_INP_DECR_REF(inp); } } SCTP_IPI_ITERATOR_WQ_UNLOCK(); } /* release sctp_inpcb unbind the port */ void sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) { /* * Here we free a endpoint. We must find it (if it is in the Hash * table) and remove it from there. Then we must also find it in the * overall list and remove it from there. After all removals are * complete then any timer has to be stopped. Then start the actual * freeing. a) Any local lists. b) Any associations. 
c) The hash of * all associations. d) finally the ep itself. */ struct sctp_tcb *stcb, *nstcb; struct sctp_laddr *laddr, *nladdr; struct inpcb *ip_pcb; struct socket *so; int being_refed = 0; struct sctp_queued_to_read *sq, *nsq; int cnt; sctp_sharedkey_t *shared_key, *nshared_key; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 0); #endif SCTP_ITERATOR_LOCK(); /* mark any iterators on the list or being processed */ sctp_iterator_inp_being_freed(inp); SCTP_ITERATOR_UNLOCK(); SCTP_ASOC_CREATE_LOCK(inp); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); so = inp->sctp_socket; KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) != 0, ("%s: inp %p still has socket", __func__, inp)); KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0, ("%s: double free of inp %p", __func__, inp)); if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP; /* socket is gone, so no more wakeups allowed */ inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; } /* First time through we have the socket lock, after that no more. */ sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_1); if (inp->control) { sctp_m_freem(inp->control); inp->control = NULL; } if (inp->pkt) { sctp_m_freem(inp->pkt); inp->pkt = NULL; } ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer * here but I will be nice :> (i.e. * ip_pcb = ep;) */ if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { int cnt_in_sd; cnt_in_sd = 0; LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_list, sctp_tcblist, nstcb) { SCTP_TCB_LOCK(stcb); /* Disconnect the socket please. 
*/ stcb->sctp_socket = NULL; SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_CLOSED_SOCKET); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* Skip guys being freed */ cnt_in_sd++; if (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { /* * Special case - we did not start a * kill timer on the asoc due to it * was not closed. So go ahead and * start it now. */ SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } SCTP_TCB_UNLOCK(stcb); continue; } if (((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) && (stcb->asoc.total_output_queue_size == 0)) { /* * If we have data in queue, we don't want * to just free since the app may have done, * send()/close or connect/send/close. And * it wants the data to get across first. */ /* Just abandon things in the front states */ if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) { cnt_in_sd++; } continue; } if ((stcb->asoc.size_on_reasm_queue > 0) || (stcb->asoc.control_pdapi) || (stcb->asoc.size_on_all_streams > 0) || ((so != NULL) && (SCTP_SBAVAIL(&so->so_rcv) > 0))) { /* Left with Data unread */ struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) { cnt_in_sd++; } continue; } else if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { if ((*stcb->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, &stcb->asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(stcb) != 
SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* * there is nothing queued to send, * so I send shutdown */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED); } } else { /* mark into shutdown pending */ SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); if ((*stcb->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, &stcb->asoc)) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) { cnt_in_sd++; } continue; } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } cnt_in_sd++; SCTP_TCB_UNLOCK(stcb); } /* now is there some left in our SHUTDOWN state? 
*/ if (cnt_in_sd) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 2); #endif inp->sctp_socket = NULL; SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } } inp->sctp_socket = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* * ok, this guy has been bound. It's port is somewhere in * the SCTP_BASE_INFO(hash table). Remove it! */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; } /* * If there is a timer running to kill us, forget it, since it may * have a contest on the INP lock.. which would cause us to die ... */ cnt = 0; LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_list, sctp_tcblist, nstcb) { SCTP_TCB_LOCK(stcb); if (immediate != SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { /* Disconnect the socket please */ stcb->sctp_socket = NULL; SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_CLOSED_SOCKET); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { if (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } cnt++; SCTP_TCB_UNLOCK(stcb); continue; } /* Free associations that are NOT killing us */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) && ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } else if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { cnt++; SCTP_TCB_UNLOCK(stcb); continue; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { cnt++; } } if (cnt) { /* Ok we have someone out there that will kill us */ #ifdef SCTP_LOG_CLOSING 
sctp_log_closing(inp, NULL, 3); #endif SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } if (SCTP_INP_LOCK_CONTENDED(inp)) being_refed++; if (SCTP_INP_READ_CONTENDED(inp)) being_refed++; if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp)) being_refed++; /* NOTE: 0 refcount also means no timers are referencing us. */ if ((inp->refcount) || (being_refed) || (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 4); #endif sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } inp->sctp_ep.signature_change.type = 0; inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE; /* * Remove it from the list .. last thing we need a lock for. */ LIST_REMOVE(inp, sctp_list); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 5); #endif if ((inp->sctp_asocidhash) != NULL) { SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark); inp->sctp_asocidhash = NULL; } /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(sq, &inp->read_queue, next, nsq) { /* Its only abandoned if it had data left */ if (sq->length) SCTP_STAT_INCR(sctps_left_abandon); TAILQ_REMOVE(&inp->read_queue, sq, next); sctp_free_remote_addr(sq->whoFrom); if (so) so->so_rcv.sb_cc -= sq->length; if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } /* * no need to free the net count, since at this point all * assoc's are gone. */ sctp_free_a_readq(NULL, sq); } /* Now the sctp_pcb things */ /* * free each asoc if it is not already closed/free. we can't use the * macro here since le_next will get freed as part of the * sctp_free_assoc() call. 
*/ if (ip_pcb->inp_options) { (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; } #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { ip6_freepcbopts(ip_pcb->in6p_outputopts); } #endif /* INET6 */ ip_pcb->inp_vflag = 0; /* free up authentication fields */ if (inp->sctp_ep.local_auth_chunks != NULL) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); if (inp->sctp_ep.local_hmacs != NULL) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); LIST_FOREACH_SAFE(shared_key, &inp->sctp_ep.shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* * if we have an address list the following will free the list of * ifaddr's that are set into this ep. Again macro limitations here, * since the LIST_FOREACH could be a bad idea. */ LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { sctp_remove_laddr(laddr); } #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_free_list, sctp_tcblist, nstcb) { LIST_REMOVE(stcb, sctp_tcblist); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); } /* *** END TEMP CODE *** */ #endif /* Now lets see about freeing the EP hash table. 
*/ if (inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark); inp->sctp_tcbhash = NULL; } /* Now we must put the ep memory back into the zone pool */ crfree(inp->ip_inp.inp.inp_cred); INP_LOCK_DESTROY(&inp->ip_inp.inp); SCTP_INP_LOCK_DESTROY(inp); SCTP_INP_READ_DESTROY(inp); SCTP_ASOC_CREATE_LOCK_DESTROY(inp); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_DECR_EP_COUNT(); } struct sctp_nets * sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr) { struct sctp_nets *net; /* locate the address */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr)) return (net); } return (NULL); } int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) { struct sctp_ifa *sctp_ifa; sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa) { return (1); } else { return (0); } } /* * add's a remote endpoint address, done with the INIT/INIT-ACK as well as * when a ASCONF arrives that adds it. It will also initialize all the cwnd * stats of stuff. */ int sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, struct sctp_nets **netp, uint16_t port, int set_scope, int from) { /* * The following is redundant to the same lines in the * sctp_aloc_assoc() but is needed since others call the add address * function */ struct sctp_nets *net, *netfirst; int addr_inscope; SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ", from); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr); netfirst = sctp_findnet(stcb, newaddr); if (netfirst) { /* * Lie and return ok, we don't want to make the association * go away for this behavior. It will happen in the TCP * model in a connected socket. It does not reach the hash * table until after the association is built so it can't be * found. Mark as reachable, since the initial creation will * have been cleared and the NOT_IN_ASSOC flag will have * been added... 
and we don't want to end up removing it * back out. */ if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) { netfirst->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED); } else { netfirst->dest_state = SCTP_ADDR_REACHABLE; } return (0); } addr_inscope = 1; switch (newaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)newaddr; if (sin->sin_addr.s_addr == 0) { /* Invalid address */ return (-1); } /* zero out the zero area */ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* assure len is set */ sin->sin_len = sizeof(struct sockaddr_in); if (set_scope) { if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { stcb->asoc.scope.ipv4_local_scope = 1; } } else { /* Validate the address is in scope */ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && (stcb->asoc.scope.ipv4_local_scope == 0)) { addr_inscope = 0; } } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)newaddr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Invalid address */ return (-1); } /* assure len is set */ sin6->sin6_len = sizeof(struct sockaddr_in6); if (set_scope) { if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { /* * If the new destination is a * LINK_LOCAL we must have common * site scope. Don't set the local * scope since we may not share all * links, only loopback can do this. * Links on the local network would * also be on our private network * for v4 too. */ stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { /* * If the new destination is * SITE_LOCAL then we must have site * scope in common. 
*/ stcb->asoc.scope.site_scope = 1; } } else { /* Validate the address is in scope */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) && (stcb->asoc.scope.loopback_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.local_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.site_scope == 0)) { addr_inscope = 0; } } break; } #endif default: /* not supported family type */ return (-1); } net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets); if (net == NULL) { return (-1); } SCTP_INCR_RADDR_COUNT(); memset(net, 0, sizeof(struct sctp_nets)); (void)SCTP_GETTIME_TIMEVAL(&net->start_time); memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len); switch (newaddr->sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport; break; #endif default: break; } net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id); if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.site_scope = 1; addr_inscope = 1; } net->failure_threshold = stcb->asoc.def_net_failure; net->pf_threshold = stcb->asoc.def_net_pf_threshold; if (addr_inscope == 0) { net->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_OUT_OF_SCOPE); } else { if (from == SCTP_ADDR_IS_CONFIRMED) /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */ net->dest_state = SCTP_ADDR_REACHABLE; else net->dest_state = SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED; } /* * We set this to 0, the timer code knows that this means its an * initial value */ net->rto_needed = 1; net->RTO = 0; net->RTO_measured = 0; stcb->asoc.numnets++; net->ref_count = 1; net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 
1; net->port = port; net->dscp = stcb->asoc.default_dscp; #ifdef INET6 net->flowlabel = stcb->asoc.default_flowlabel; #endif if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { net->dest_state |= SCTP_ADDR_NOHB; } else { net->dest_state &= ~SCTP_ADDR_NOHB; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { net->dest_state |= SCTP_ADDR_NO_PMTUD; } else { net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } net->heart_beat_delay = stcb->asoc.heart_beat_delay; /* Init the timer structure */ SCTP_OS_TIMER_INIT(&net->rxt_timer.timer); SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer); SCTP_OS_TIMER_INIT(&net->hb_timer.timer); /* Now generate a route for this guy */ #ifdef INET6 /* KAME hack: embed scopeid */ if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); sin6->sin6_scope_id = 0; } #endif SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->asoc.vrf_id, stcb->sctp_ep->fibnum); net->src_addr_selected = 0; if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { /* Get source address */ net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep, stcb, (sctp_route_t *)&net->ro, net, 0, stcb->asoc.vrf_id); if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else if (net->ro._s_addr != NULL) { uint32_t imtu, rmtu, hcmtu; net->src_addr_selected = 1; /* Now get the interface MTU */ if (net->ro._s_addr->ifn_p != NULL) { /* * XXX: Should we here just use * net->ro._s_addr->ifn_p->ifn_mtu */ imtu = SCTP_GATHER_MTU_FROM_IFN_INFO(net->ro._s_addr->ifn_p->ifn_p, net->ro._s_addr->ifn_p->ifn_index); } else { imtu 
= 0; } rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh); hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum); net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu); } } if (net->mtu == 0) { if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else { switch (newaddr->sa_family) { #ifdef INET case AF_INET: net->mtu = SCTP_DEFAULT_MTU; break; #endif #ifdef INET6 case AF_INET6: net->mtu = 1280; break; #endif default: break; } } } #if defined(INET) || defined(INET6) if (net->port) { net->mtu -= (uint32_t)sizeof(struct udphdr); } #endif if (from == SCTP_ALLOC_ASOC) { stcb->asoc.smallest_mtu = net->mtu; } if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu, true); } #ifdef INET6 if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_recoverscope(sin6); } #endif /* JRS - Use the congestion control given in the CC module */ if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) (*stcb->asoc.cc_functions.sctp_set_initial_cc_param) (stcb, net); /* * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning * of assoc (2005/06/27, iyengar@cis.udel.edu) */ net->find_pseudo_cumack = 1; net->find_rtx_pseudo_cumack = 1; /* Choose an initial flowid. 
*/ net->flowid = stcb->asoc.my_vtag ^ ntohs(stcb->rport) ^ ntohs(stcb->sctp_ep->sctp_lport); net->flowtype = M_HASHTYPE_OPAQUE_HASH; if (netp) { *netp = net; } netfirst = TAILQ_FIRST(&stcb->asoc.nets); if (net->ro.ro_nh == NULL) { /* Since we have no route put it at the back */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); } else if (netfirst == NULL) { /* We are the first one in the pool. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (netfirst->ro.ro_nh == NULL) { /* * First one has NO route. Place this one ahead of the first * one. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) { /* * This one has a different interface than the one at the * top of the list. Place it ahead. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else { /* * Ok we have the same interface as the first one. Move * forward until we find either a) one with a NULL route... * insert ahead of that b) one with a different ifp.. insert * after that. c) end of the list.. insert at the tail. 
*/ struct sctp_nets *netlook; do { netlook = TAILQ_NEXT(netfirst, sctp_next); if (netlook == NULL) { /* End of the list */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); break; } else if (netlook->ro.ro_nh == NULL) { /* next one has NO route */ TAILQ_INSERT_BEFORE(netfirst, net, sctp_next); break; } else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) { TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook, net, sctp_next); break; } /* Shift forward */ netfirst = netlook; } while (netlook != NULL); } /* got to have a primary set */ if (stcb->asoc.primary_destination == 0) { stcb->asoc.primary_destination = net; } else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) && (net->ro.ro_nh) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) { /* No route to current primary adopt new primary */ stcb->asoc.primary_destination = net; } /* Validate primary is first */ net = TAILQ_FIRST(&stcb->asoc.nets); if ((net != stcb->asoc.primary_destination) && (stcb->asoc.primary_destination)) { /* * first one on the list is NOT the primary sctp_cmpaddr() * is much more efficient if the primary is the first on the * list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } static uint32_t sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb) { uint32_t id; struct sctpasochead *head; struct sctp_tcb *lstcb; try_again: if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* TSNH */ return (0); } /* * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC, * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC. 
*/ if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) { inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1; } id = inp->sctp_associd_counter; inp->sctp_associd_counter++; lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t)id, 0); if (lstcb) { goto try_again; } head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash); stcb->asoc.in_asocid_hash = 1; return (id); } /* * allocate an association and add it to the endpoint. The caller must be * careful to add all additional addresses once they are know right away or * else the assoc will be may experience a blackout scenario. */ static struct sctp_tcb * sctp_aloc_assoc_locked(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { /* note the p argument is only valid in unbound sockets */ struct sctp_tcb *stcb; struct sctp_association *asoc; struct sctpasochead *head; uint16_t rport; int err; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); /* * Assumption made here: Caller has done a * sctp_findassociation_ep_addr(ep, addr's); to make sure the * address does not exist already. 
*/ if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) { /* Hit max assoc, sorry no more */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } if (firstaddr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) || (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { /* * If its in the TCP pool, its NOT allowed to create an * association. The parent listener needs to call * sctp_aloc_assoc.. or the one-2-many socket. If a peeled * off, or connected one does this.. its an error. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } } SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:"); #ifdef SCTP_DEBUG if (firstaddr) { SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr); switch (firstaddr->sa_family) { #ifdef INET case AF_INET: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in *)firstaddr)->sin_port)); break; #endif #ifdef INET6 case AF_INET6: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in6 *)firstaddr)->sin6_port)); break; #endif default: break; } } else { SCTPDBG(SCTP_DEBUG_PCB3, "None\n"); } #endif /* SCTP_DEBUG */ switch (firstaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)firstaddr; if ((ntohs(sin->sin_port) == 0) || 
(sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (SCTP_IPV6_V6ONLY(inp) != 0))) { /* Invalid address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin->sin_port; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)firstaddr; if ((ntohs(sin6->sin6_port) == 0) || IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0)) { /* Invalid address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin6->sin6_port; break; } #endif default: /* not supported family type */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* * If you have not performed a bind, then we need to do the * ephemeral bind for you. */ if ((err = sctp_inpcb_bind_locked(inp, NULL, NULL, p))) { /* bind error, probably perm */ *error = err; return (NULL); } } stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb); if (stcb == NULL) { /* out of memory? 
*/ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); *error = ENOMEM; return (NULL); } SCTP_INCR_ASOC_COUNT(); memset(stcb, 0, sizeof(*stcb)); asoc = &stcb->asoc; SCTP_TCB_LOCK_INIT(stcb); stcb->rport = rport; /* setup back pointer's */ stcb->sctp_ep = inp; stcb->sctp_socket = inp->sctp_socket; if ((err = sctp_init_asoc(inp, stcb, override_tag, initial_tsn, vrf_id, o_streams))) { /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); *error = err; return (NULL); } SCTP_TCB_LOCK(stcb); asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); /* now that my_vtag is set, add it to the hash */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* put it in the bucket in the vtag hash of assoc's for the system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); if (sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC)) { /* failure.. memory error? 
*/ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } SCTP_DECR_ASOC_COUNT(); SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_asocs); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } /* Init all the timers */ SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer); SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer); SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer); SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer); SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer); SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer); LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist); /* now file the port under the hash as well */ if (inp->sctp_tcbhash != NULL) { head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport, inp->sctp_hashmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); } if (initialize_auth_params == SCTP_INITIALIZE_AUTH_PARAMS) { sctp_initialize_auth_params(inp, stcb); } SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb); return (stcb); } struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { struct sctp_tcb *stcb; SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); stcb = sctp_aloc_assoc_locked(inp, firstaddr, error, override_tag, initial_tsn, vrf_id, o_streams, port, p, initialize_auth_params); SCTP_INP_INFO_WUNLOCK(); SCTP_INP_WUNLOCK(inp); return (stcb); } struct sctp_tcb * sctp_aloc_assoc_connected(struct sctp_inpcb *inp, struct sockaddr 
*firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn,
    uint32_t vrf_id, uint16_t o_streams, uint16_t port,
    struct thread *p,
    int initialize_auth_params)
{
	struct sctp_tcb *stcb;

	/*
	 * Same locking wrapper as sctp_aloc_assoc(), with two additions for
	 * endpoints that carry SCTP_PCB_FLAGS_TCPTYPE: a listening endpoint
	 * is refused with EINVAL, and on success the endpoint is flagged
	 * CONNECTED and the socket is moved to the connecting state while
	 * the endpoint lock is still held.
	 */
	SCTP_INP_INFO_WLOCK();
	SCTP_INP_WLOCK(inp);
	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
	    SCTP_IS_LISTENING(inp)) {
		/* Drop both locks before reporting the error. */
		SCTP_INP_INFO_WUNLOCK();
		SCTP_INP_WUNLOCK(inp);
		*error = EINVAL;
		return (NULL);
	}
	stcb = sctp_aloc_assoc_locked(inp, firstaddr, error, override_tag,
	    initial_tsn, vrf_id, o_streams, port, p, initialize_auth_params);
	/* The global lock is no longer needed; keep only the endpoint lock. */
	SCTP_INP_INFO_WUNLOCK();
	if (stcb != NULL && (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) {
		inp->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
		soisconnecting(inp->sctp_socket);
	}
	SCTP_INP_WUNLOCK(inp);
	return (stcb);
}

void
sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net)
{
	struct sctp_inpcb *inp;
	struct sctp_association *asoc;

	inp = stcb->sctp_ep;
	asoc = &stcb->asoc;
	/* Unlink the net from the association before fixing up references. */
	asoc->numnets--;
	TAILQ_REMOVE(&asoc->nets, net, sctp_next);
	if (net == asoc->primary_destination) {
		/* Reset primary */
		struct sctp_nets *lnet;

		/* First remaining net; used below as the search start. */
		lnet = TAILQ_FIRST(&asoc->nets);
		/*
		 * Mobility adaptation Ideally, if deleted destination is
		 * the primary, it becomes a fast retransmission trigger by
		 * the subsequent SET PRIMARY.
(by micchie) */
		if (sctp_is_mobility_feature_on(stcb->sctp_ep,
		    SCTP_MOBILITY_BASE) ||
		    sctp_is_mobility_feature_on(stcb->sctp_ep,
		    SCTP_MOBILITY_FASTHANDOFF)) {
			SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n");
			if (asoc->deleted_primary != NULL) {
				SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n");
				goto out;
			}
			/*
			 * Remember the deleted primary and hold a reference
			 * on it for as long as asoc->deleted_primary points
			 * at it.
			 */
			asoc->deleted_primary = net;
			atomic_add_int(&net->ref_count, 1);
			memset(&net->lastsa, 0, sizeof(net->lastsa));
			memset(&net->lastsv, 0, sizeof(net->lastsv));
			sctp_mobility_feature_on(stcb->sctp_ep,
			    SCTP_MOBILITY_PRIM_DELETED);
			sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED,
			    stcb->sctp_ep, stcb, NULL);
		}
out:
		/* Try to find a confirmed primary */
		asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0);
	}
	if (net == asoc->last_data_chunk_from) {
		/* Point the last-data source at the first remaining net */
		asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets);
	}
	if (net == asoc->last_control_chunk_from) {
		/* Clear net */
		asoc->last_control_chunk_from = NULL;
	}
	if (net == asoc->last_net_cmt_send_started) {
		/* Clear net */
		asoc->last_net_cmt_send_started = NULL;
	}
	if (net == stcb->asoc.alternate) {
		/* Release the reference held through the alternate pointer */
		sctp_free_remote_addr(stcb->asoc.alternate);
		stcb->asoc.alternate = NULL;
	}
	/* Stop the per-net timers before the final release below. */
	sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
	    SCTP_FROM_SCTP_PCB + SCTP_LOC_9);
	sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
	    SCTP_FROM_SCTP_PCB + SCTP_LOC_10);
	net->dest_state |= SCTP_ADDR_BEING_DELETED;
	sctp_free_remote_addr(net);
}

/*
 * Remove a remote endpoint address from an association.
 *
 * Returns 0 when the address was found and removed, -1 when it was found
 * but is the only remote address left (removal refused), and -2 when the
 * address is not part of the association.
 */
int
sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr)
{
	/*
	 * Here we need to remove a remote address. This is quite simple, we
	 * first find it in the list of address for the association
	 * (stcb->asoc.nets) and then if it is there, we hand it to
	 * sctp_remove_net(). Note we do not allow it to be removed if there
	 * are no other addresses.
*/ struct sctp_association *asoc; struct sctp_nets *net, *nnet; asoc = &stcb->asoc; /* locate the address */ TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) { continue; } if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr, remaddr)) { /* we found the guy */ if (asoc->numnets < 2) { /* Must have at LEAST two remote addresses */ return (-1); } else { sctp_remove_net(stcb, net); return (0); } } } /* not found. */ return (-2); } static bool sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport, uint32_t now) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; int i; SCTP_INP_INFO_LOCK_ASSERT(); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].tv_sec_at_expire >= now) && (twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { return (true); } } } return (false); } static void sctp_set_vtag_block(struct sctp_timewait *vtag_block, uint32_t time, uint32_t tag, uint16_t lport, uint16_t rport) { vtag_block->tv_sec_at_expire = time; vtag_block->v_tag = tag; vtag_block->lport = lport; vtag_block->rport = rport; } static void sctp_add_vtag_to_timewait(uint32_t tag, uint16_t lport, uint16_t rport) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct timeval now; uint32_t time; int i; bool set; SCTP_INP_INFO_WLOCK_ASSERT(); (void)SCTP_GETTIME_TIMEVAL(&now); time = (uint32_t)now.tv_sec + SCTP_BASE_SYSCTL(sctp_vtag_time_wait); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; set = false; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* Block(s) present, lets find space, and expire on the fly */ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].v_tag == 0) && !set) { 
sctp_set_vtag_block(twait_block->vtag_block + i, time, tag, lport, rport); set = true; continue; } if ((twait_block->vtag_block[i].v_tag != 0) && (twait_block->vtag_block[i].tv_sec_at_expire < (uint32_t)now.tv_sec)) { if (set) { /* Audit expires this guy */ sctp_set_vtag_block(twait_block->vtag_block + i, 0, 0, 0, 0); } else { /* Reuse it for the new tag */ sctp_set_vtag_block(twait_block->vtag_block + i, time, tag, lport, rport); set = true; } } } if (set) { /* * We only do up to the block where we can place our * tag for audits */ break; } } /* Need to add a new block to chain */ if (!set) { SCTP_MALLOC(twait_block, struct sctp_tagblock *, sizeof(struct sctp_tagblock), SCTP_M_TIMW); if (twait_block == NULL) { return; } memset(twait_block, 0, sizeof(struct sctp_tagblock)); LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock); sctp_set_vtag_block(twait_block->vtag_block, time, tag, lport, rport); } } void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh) { struct sctp_tmit_chunk *chk, *nchk; struct sctp_queued_to_read *control, *ncontrol; TAILQ_FOREACH_SAFE(control, rh, next_instrm, ncontrol) { TAILQ_REMOVE(rh, control, next_instrm); control->on_strm_q = 0; if (control->on_read_q == 0) { sctp_free_remote_addr(control->whoFrom); if (control->data) { sctp_m_freem(control->data); control->data = NULL; } } /* Reassembly free? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* * We don't free the address here since all the net's were * freed above. */ if (control->on_read_q == 0) { sctp_free_a_readq(stcb, control); } } } /*- * Free the association after un-hashing the remote port. 
This function ALWAYS returns holding NO LOCK on the stcb. It DOES
 * expect that the input to this function IS a locked TCB.
 * It will return 0, if it did NOT destroy the association (instead
 * it unlocks it). It will return NON-zero if it either destroyed the
 * association OR the association is already destroyed.
 *
 * from_inpcbfree: when it equals SCTP_NORMAL_PROC this function acquires
 * (and later releases) the INP-info and INP locks itself; for any other
 * value those lock operations are skipped.
 * from_location: recorded in stcb->freed_from_where on the first call only.
 */
int
sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location)
{
	int i;
	struct sctp_association *asoc;
	struct sctp_nets *net, *nnet;
	struct sctp_laddr *laddr, *naddr;
	struct sctp_tmit_chunk *chk, *nchk;
	struct sctp_asconf_addr *aparam, *naparam;
	struct sctp_asconf_ack *aack, *naack;
	struct sctp_stream_reset_list *strrst, *nstrrst;
	struct sctp_queued_to_read *sq, *nsq;
	struct sctp_stream_queue_pending *sp, *nsp;
	sctp_sharedkey_t *shared_key, *nshared_key;
	struct socket *so;

	/* The caller must hand us a locked TCB. */

	SCTP_TCB_LOCK_ASSERT(stcb);

#ifdef SCTP_LOG_CLOSING
	sctp_log_closing(inp, stcb, 6);
#endif
	/* state == 0 is what this function sets once an asoc is torn down. */
	if (stcb->asoc.state == 0) {
#ifdef SCTP_LOG_CLOSING
		sctp_log_closing(inp, NULL, 7);
#endif
		/* there is no asoc, really TSNH :-0 */
		return (1);
	}
	if (stcb->asoc.alternate) {
		sctp_free_remote_addr(stcb->asoc.alternate);
		stcb->asoc.alternate = NULL;
	}
	/* TEMP CODE */
	if (stcb->freed_from_where == 0) {
		/* Only record the first place free happened from */
		stcb->freed_from_where = from_location;
	}
	/* TEMP CODE */

	asoc = &stcb->asoc;
	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
		/* nothing around */
		so = NULL;
	else
		so = inp->sctp_socket;

	/*
	 * We used timer based freeing if a reader or writer is in the way.
	 * So we first check if we are actually being called from a timer,
	 * if so we abort early if a reader or writer is still in the way.
	 */
	if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) &&
	    (from_inpcbfree == SCTP_NORMAL_PROC)) {
		/*
		 * is it the timer driving us? if so are the reader/writers
		 * gone?
		 */
		if (stcb->asoc.refcnt) {
			/* nope, reader or writer in the way */
			sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
			/* no asoc destroyed */
			SCTP_TCB_UNLOCK(stcb);
#ifdef SCTP_LOG_CLOSING
			sctp_log_closing(inp, stcb, 8);
#endif
			return (0);
		}
	}
	/* Now clean up any other timers */
	sctp_stop_association_timers(stcb, false);
	/* Now the read queue needs to be cleaned up (only once) */
	if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) {
		SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_ABOUT_TO_BE_FREED);
		SCTP_INP_READ_LOCK(inp);
		TAILQ_FOREACH(sq, &inp->read_queue, next) {
			if (sq->stcb == stcb) {
				sq->do_not_ref_stcb = 1;
				sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn;
				/*
				 * If there is no end, there never will be
				 * now.
				 */
				if (sq->end_added == 0) {
					/* Held for PD-API clear that. */
					sq->pdapi_aborted = 1;
					sq->held_length = 0;
					if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) {
						/*
						 * Need to add a PD-API
						 * aborted indication.
						 * Setting the control_pdapi
						 * assures that it will be
						 * added right after this
						 * msg.
						 */
						uint32_t strseq;

						stcb->asoc.control_pdapi = sq;
						/* stream id in the high 16 bits, low 16 bits of mid below */
						strseq = (sq->sinfo_stream << 16) | (sq->mid & 0x0000ffff);
						sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION,
						    stcb,
						    SCTP_PARTIAL_DELIVERY_ABORTED,
						    (void *)&strseq,
						    SCTP_SO_LOCKED);
						stcb->asoc.control_pdapi = NULL;
					}
				}
				/* Add an end to wake them */
				sq->end_added = 1;
			}
		}
		SCTP_INP_READ_UNLOCK(inp);
		if (stcb->block_entry) {
			SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET);
			stcb->block_entry->error = ECONNRESET;
			stcb->block_entry = NULL;
		}
	}
	if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) {
		/*
		 * Someone holds a reference OR the socket is unaccepted
		 * yet.
		 */
		if ((stcb->asoc.refcnt) ||
		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
			SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
			sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
		}
		if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
			/* nothing around */
			so = NULL;
		if (so) {
			/* Wake any reader/writers */
			sctp_sorwakeup(inp, so);
			sctp_sowwakeup(inp, so);
		}
		SCTP_TCB_UNLOCK(stcb);

#ifdef SCTP_LOG_CLOSING
		sctp_log_closing(inp, stcb, 9);
#endif
		/* no asoc destroyed */
		return (0);
	}
#ifdef SCTP_LOG_CLOSING
	sctp_log_closing(inp, stcb, 10);
#endif
	/*
	 * When I reach here, no others want to kill the assoc yet.. and I
	 * own the lock. Now its possible an abort comes in when I do the
	 * lock exchange below to grab all the locks to do the final take
	 * out. to prevent this we increment the count, which will start a
	 * timer and blow out above thus assuring us that we hold exclusive
	 * killing of the asoc. Note that after getting back the TCB lock we
	 * will go ahead and decrement the counter back down and stop any
	 * timer a passing stranger may have started :-S
	 */
	if (from_inpcbfree == SCTP_NORMAL_PROC) {
		atomic_add_int(&stcb->asoc.refcnt, 1);

		/* Lock exchange: TCB lock is dropped, then INFO -> INP -> TCB. */
		SCTP_TCB_UNLOCK(stcb);
		SCTP_INP_INFO_WLOCK();
		SCTP_INP_WLOCK(inp);
		SCTP_TCB_LOCK(stcb);
	}
	/* Double check the GONE flag */
	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
		/* nothing around */
		so = NULL;

	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
		/*
		 * For TCP type we need special handling when we are
		 * connected. We also include the peel'ed off ones too.
		 */
		if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
			inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED;
			inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED;
			if (so) {
				/*
				 * NOTE(review): no matching SOCKBUF_UNLOCK is
				 * visible here; presumably
				 * socantrcvmore_locked() releases the receive
				 * buffer lock -- confirm.
				 */
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_state &= ~(SS_ISCONNECTING |
				    SS_ISDISCONNECTING |
				    SS_ISCONFIRMING |
				    SS_ISCONNECTED);
				so->so_state |= SS_ISDISCONNECTED;
				socantrcvmore_locked(so);
				socantsendmore(so);
				sctp_sowwakeup(inp, so);
				sctp_sorwakeup(inp, so);
				SCTP_SOWAKEUP(so);
			}
		}
	}

	/*
	 * Make it invalid too, that way if its about to run it will abort
	 * and return.
	 */
	/* drop the extra reference taken before the lock exchange above */
	if (from_inpcbfree == SCTP_NORMAL_PROC) {
		atomic_subtract_int(&stcb->asoc.refcnt, 1);
	}
	if (stcb->asoc.refcnt) {
		/* Somebody grabbed a reference meanwhile: retry from the timer. */
		SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
		sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
		if (from_inpcbfree == SCTP_NORMAL_PROC) {
			SCTP_INP_INFO_WUNLOCK();
			SCTP_INP_WUNLOCK(inp);
		}
		SCTP_TCB_UNLOCK(stcb);
		return (0);
	}
	/* From here on the association is going away for good. */
	asoc->state = 0;
	if (inp->sctp_tcbhash) {
		LIST_REMOVE(stcb, sctp_tcbhash);
	}
	if (stcb->asoc.in_asocid_hash) {
		LIST_REMOVE(stcb, sctp_tcbasocidhash);
	}
	if (inp->sctp_socket == NULL) {
		stcb->sctp_socket = NULL;
	}
	/* Now lets remove it from the list of ALL associations in the EP */
	LIST_REMOVE(stcb, sctp_tcblist);
	if (from_inpcbfree == SCTP_NORMAL_PROC) {
		/* Hold a reference on the inp while its lock is released. */
		SCTP_INP_INCR_REF(inp);
		SCTP_INP_WUNLOCK(inp);
	}
	/* pull from vtag hash */
	LIST_REMOVE(stcb, sctp_asocs);
	sctp_add_vtag_to_timewait(asoc->my_vtag, inp->sctp_lport, stcb->rport);

	/*
	 * Now restop the timers to be sure this is paranoia at its finest!
	 */
	sctp_stop_association_timers(stcb, true);

	/*
	 * The chunk lists and such SHOULD be empty but we check them just
	 * in case.
	 */
	/* anything on the wheel needs to be removed */
	for (i = 0; i < asoc->streamoutcnt; i++) {
		struct sctp_stream_out *outs;

		outs = &asoc->strmout[i];
		/* now clean up any chunks here */
		TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
			atomic_subtract_int(&asoc->stream_queue_cnt, 1);
			TAILQ_REMOVE(&outs->outqueue, sp, next);
			stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp);
			sctp_free_spbufspace(stcb, asoc, sp);
			if (sp->data) {
				if (so) {
					/* Still an open socket - report */
					sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
					    0, (void *)sp, SCTP_SO_LOCKED);
				}
				/* data is re-tested after the notification call */
				if (sp->data) {
					sctp_m_freem(sp->data);
					sp->data = NULL;
					sp->tail_mbuf = NULL;
					sp->length = 0;
				}
			}
			if (sp->net) {
				sctp_free_remote_addr(sp->net);
				sp->net = NULL;
			}
			sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED);
		}
	}
	/* sa_ignore FREED_MEMORY */
	TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) {
		TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp);
		SCTP_FREE(strrst, SCTP_M_STRESET);
	}
	TAILQ_FOREACH_SAFE(sq, &asoc->pending_reply_queue, next, nsq) {
		TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next);
		if (sq->data) {
			sctp_m_freem(sq->data);
			sq->data = NULL;
		}
		sctp_free_remote_addr(sq->whoFrom);
		sq->whoFrom = NULL;
		sq->stcb = NULL;
		/* Free the ctl entry */
		sctp_free_a_readq(stcb, sq);
		/* sa_ignore FREED_MEMORY */
	}
	TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) {
		TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next);
		if (chk->data) {
			sctp_m_freem(chk->data);
			chk->data = NULL;
		}
		if (chk->holds_key_ref)
			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
		SCTP_DECR_CHK_COUNT();
		atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1);
		asoc->free_chunk_cnt--;
		/* sa_ignore FREED_MEMORY */
	}
	/* pending send queue SHOULD be empty */
	TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
		/* Keep the per-stream chunk accounting in step. */
		if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
			asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
#ifdef INVARIANTS
		} else {
			panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
#endif
		}
		TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
		if (chk->data) {
			if (so) {
				/* Still a socket? */
				sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb,
				    0, chk, SCTP_SO_LOCKED);
			}
			if (chk->data) {
				sctp_m_freem(chk->data);
				chk->data = NULL;
			}
		}
		if (chk->holds_key_ref)
			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
		if (chk->whoTo) {
			sctp_free_remote_addr(chk->whoTo);
			chk->whoTo = NULL;
		}
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
		SCTP_DECR_CHK_COUNT();
		/* sa_ignore FREED_MEMORY */
	}
	/* sent queue SHOULD be empty */
	TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) {
		/* NR-acked chunks are skipped in the per-stream accounting. */
		if (chk->sent != SCTP_DATAGRAM_NR_ACKED) {
			if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
				asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
#ifdef INVARIANTS
			} else {
				panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
#endif
			}
		}
		TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
		if (chk->data) {
			if (so) {
				/* Still a socket? */
				sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb,
				    0, chk, SCTP_SO_LOCKED);
			}
			if (chk->data) {
				sctp_m_freem(chk->data);
				chk->data = NULL;
			}
		}
		if (chk->holds_key_ref)
			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
		sctp_free_remote_addr(chk->whoTo);
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
		SCTP_DECR_CHK_COUNT();
		/* sa_ignore FREED_MEMORY */
	}
#ifdef INVARIANTS
	for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
		if (stcb->asoc.strmout[i].chunks_on_queues > 0) {
			panic("%u chunks left for stream %u.", stcb->asoc.strmout[i].chunks_on_queues, i);
		}
	}
#endif
	/* control queue MAY not be empty */
	TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) {
		TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
		if (chk->data) {
			sctp_m_freem(chk->data);
			chk->data = NULL;
		}
		if (chk->holds_key_ref)
			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
		sctp_free_remote_addr(chk->whoTo);
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
		SCTP_DECR_CHK_COUNT();
		/* sa_ignore FREED_MEMORY */
	}
	/* ASCONF queue MAY not be empty */
	TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) {
		TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next);
		if (chk->data) {
			sctp_m_freem(chk->data);
			chk->data = NULL;
		}
		if (chk->holds_key_ref)
			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
		sctp_free_remote_addr(chk->whoTo);
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
		SCTP_DECR_CHK_COUNT();
		/* sa_ignore FREED_MEMORY */
	}
	if (asoc->mapping_array) {
		SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
		asoc->mapping_array = NULL;
	}
	if (asoc->nr_mapping_array) {
		SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
		asoc->nr_mapping_array = NULL;
	}
	/* the stream outs */
	if (asoc->strmout) {
		SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
		asoc->strmout = NULL;
	}
	asoc->strm_realoutsize = asoc->streamoutcnt = 0;
	if (asoc->strmin) {
		/* Drain both inbound queues of every stream first. */
		for (i = 0; i < asoc->streamincnt; i++) {
			sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue);
			sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue);
		}
		SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
		asoc->strmin = NULL;
	}
	asoc->streamincnt = 0;
	TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) {
#ifdef INVARIANTS
		if (SCTP_BASE_INFO(ipi_count_raddr) == 0) {
			panic("no net's left alloc'ed, or list points to itself");
		}
#endif
		TAILQ_REMOVE(&asoc->nets, net, sctp_next);
		sctp_free_remote_addr(net);
	}
	LIST_FOREACH_SAFE(laddr, &asoc->sctp_restricted_addrs, sctp_nxt_addr, naddr) {
		/* sa_ignore FREED_MEMORY */
		sctp_remove_laddr(laddr);
	}

	/* pending asconf (address) parameters */
	TAILQ_FOREACH_SAFE(aparam, &asoc->asconf_queue, next, naparam) {
		/* sa_ignore FREED_MEMORY */
		TAILQ_REMOVE(&asoc->asconf_queue, aparam, next);
		SCTP_FREE(aparam, SCTP_M_ASC_ADDR);
	}
	TAILQ_FOREACH_SAFE(aack, &asoc->asconf_ack_sent, next, naack) {
		/* sa_ignore FREED_MEMORY */
		TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next);
		if (aack->data != NULL) {
			sctp_m_freem(aack->data);
		}
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack);
	}
	/* clean up auth stuff */
	if (asoc->local_hmacs)
		sctp_free_hmaclist(asoc->local_hmacs);
	if (asoc->peer_hmacs)
		sctp_free_hmaclist(asoc->peer_hmacs);

	if (asoc->local_auth_chunks)
		sctp_free_chunklist(asoc->local_auth_chunks);
	if (asoc->peer_auth_chunks)
		sctp_free_chunklist(asoc->peer_auth_chunks);

	sctp_free_authinfo(&asoc->authinfo);

	LIST_FOREACH_SAFE(shared_key, &asoc->shared_keys, next, nshared_key) {
		LIST_REMOVE(shared_key, next);
		sctp_free_sharedkey(shared_key);
		/* sa_ignore FREED_MEMORY */
	}

	/* Insert new items here :> */

	/* Get rid of LOCK */
	SCTP_TCB_UNLOCK(stcb);
	SCTP_TCB_LOCK_DESTROY(stcb);
	if (from_inpcbfree == SCTP_NORMAL_PROC) {
		/* Trade the global write lock for a read lock on the inp. */
		SCTP_INP_INFO_WUNLOCK();
		SCTP_INP_RLOCK(inp);
	}
#ifdef SCTP_TRACK_FREED_ASOCS
	if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
		/* now clean up the tasoc itself */
		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
		SCTP_DECR_ASOC_COUNT();
	} else {
		LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist);
	}
#else
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
	SCTP_DECR_ASOC_COUNT();
#endif
	if (from_inpcbfree == SCTP_NORMAL_PROC) {
		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
			/*
			 * If its NOT the inp_free calling us AND sctp_close
			 * has been called, we call back...
			 */
			SCTP_INP_RUNLOCK(inp);
			/*
			 * This will start the kill timer (if we are the
			 * last one) since we hold an increment yet. But
			 * this is the only safe way to do this since
			 * otherwise if the socket closes at the same time
			 * we are here we might collide in the cleanup.
			 */
			sctp_inpcb_free(inp,
			    SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE,
			    SCTP_CALLED_DIRECTLY_NOCMPSET);
			/* Only now give up the inp reference taken earlier. */
			SCTP_INP_DECR_REF(inp);
		} else {
			/* The socket is still open. */
			SCTP_INP_DECR_REF(inp);
			SCTP_INP_RUNLOCK(inp);
		}
	}
	/* destroyed the asoc */
#ifdef SCTP_LOG_CLOSING
	sctp_log_closing(inp, NULL, 11);
#endif
	return (1);
}

/*
 * Determine if a destination is "reachable" based upon the addresses bound
 * to the current endpoint (e.g. only v4 or v6 currently bound).
 *
 * Returns non-zero when the destination's address family is usable on the
 * endpoint, 0 otherwise.  Note that the non-zero value is the matching
 * inp_vflag bit, not necessarily 1.
 */
/*
 * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use
 * assoc level v4/v6 flags, as the assoc *may* not have the same address
 * types bound as its endpoint
 */
int
sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr)
{
	struct sctp_inpcb *inp;
	int answer;

	/*
	 * No locks here, the TCB, in all cases is already locked and an
	 * assoc is up. There is either a INP lock by the caller applied (in
	 * asconf case when deleting an address) or NOT in the HB case,
	 * however if HB then the INP increment is up and the INP will not
	 * be removed (on top of the fact that we have a TCB lock). So we
	 * only want to read the sctp_flags, which is either bound-all or
	 * not.. no protection needed since once an assoc is up you can't be
	 * changing your binding.
	 */
	inp = stcb->sctp_ep;
	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
		/* if bound all, destination is not restricted */
		/*
		 * RRS: Question during lock work: Is this correct?
If you * are bound-all you still might need to obey the V4--V6 * flags??? IMO this bound-all stuff needs to be removed! */ return (1); } /* NOTE: all "scope" checks are done when local addresses are added */ switch (destaddr->sa_family) { #ifdef INET6 case AF_INET6: answer = inp->ip_inp.inp.inp_vflag & INP_IPV6; break; #endif #ifdef INET case AF_INET: answer = inp->ip_inp.inp.inp_vflag & INP_IPV4; break; #endif default: /* invalid family, so it's unreachable */ answer = 0; break; } return (answer); } /* * update the inp_vflags on an endpoint */ static void sctp_update_ep_vflag(struct sctp_inpcb *inp) { struct sctp_laddr *laddr; /* first clear the flag */ inp->ip_inp.inp.inp_vflag = 0; /* set the flag based on addresses on the ep list */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } switch (laddr->ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } } } /* * Add the address to the endpoint local address list There is nothing to be * done if we are bound to all addresses */ void sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) { struct sctp_laddr *laddr; struct sctp_tcb *stcb; int fnd, error = 0; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-useable addr. */ return; } } #endif /* first, is it already present? 
*/ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd == 0) { /* Not in the ep list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action); if (error != 0) return; inp->laddr_count++; /* update inp_vflag flags */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { sctp_add_local_addr_restricted(stcb, ifa); } } return; } /* * select a new (hopefully reachable) destination net (should only be used * when we deleted an ep addr that is the only usable source address to reach * the destination net) */ static void sctp_select_primary_destination(struct sctp_tcb *stcb) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* for now, we'll just pick the first reachable one we find */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) continue; if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr)) { /* found a reachable destination */ stcb->asoc.primary_destination = net; } } /* I can't there from here! ...we're gonna die shortly... */ } /* * Delete the address from the endpoint local address list. There is nothing * to be done if we are bound to all addresses */ void sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; int fnd; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. 
You have it already */
		return;
	}
	/* On a hit, laddr is left pointing at the matching list entry. */
	LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
		if (laddr->ifa == ifa) {
			fnd = 1;
			break;
		}
	}
	if (fnd && (inp->laddr_count < 2)) {
		/* can't delete unless there are at LEAST 2 addresses */
		return;
	}
	if (fnd) {
		/*
		 * clean up any use of this address go through our
		 * associations and clear any last_used_address that match
		 * this one for each assoc, see if a new primary_destination
		 * is needed
		 */
		struct sctp_tcb *stcb;

		/* clean up "next_addr_touse" */
		if (inp->next_addr_touse == laddr)
			/* delete this address */
			inp->next_addr_touse = NULL;

		/* clean up "last_used_address" */
		LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
			struct sctp_nets *net;

			/* Each association is fixed up under its own TCB lock. */
			SCTP_TCB_LOCK(stcb);
			if (stcb->asoc.last_used_address == laddr)
				/* delete this address */
				stcb->asoc.last_used_address = NULL;
			/*
			 * Now spin through all the nets and purge any ref
			 * to laddr
			 */
			TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
				if (net->ro._s_addr == laddr->ifa) {
					/* Yep, purge src address selected */
					RO_NHFREE(&net->ro);
					sctp_free_ifa(net->ro._s_addr);
					net->ro._s_addr = NULL;
					/* Force a fresh source selection later. */
					net->src_addr_selected = 0;
				}
			}
			SCTP_TCB_UNLOCK(stcb);
		}		/* for each tcb */
		/* remove it from the ep list */
		sctp_remove_laddr(laddr);
		inp->laddr_count--;
		/* update inp_vflag flags */
		sctp_update_ep_vflag(inp);
	}
	return;
}

/*
 * Add the address to the TCB local address restricted list.
 * This is a "pending" address list (eg. addresses waiting for an
 * ASCONF-ACK response) and cannot be used as a valid source address.
 *
 * Adding an address that is already on the list is a no-op.
 */
void
sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
{
	struct sctp_laddr *laddr;
	struct sctpladdr *list;

	/*
	 * Assumes TCB is locked.. and possibly the INP. May need to
	 * confirm/fix that if we need it and is not the case.
	 */
	list = &stcb->asoc.sctp_restricted_addrs;

#ifdef INET6
	if (ifa->address.sa.sa_family == AF_INET6) {
		if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
			/* Can't bind a non-existent addr.
*/
			return;
		}
	}
#endif
	/* does the address already exist? */
	LIST_FOREACH(laddr, list, sctp_nxt_addr) {
		if (laddr->ifa == ifa) {
			return;
		}
	}

	/* add to the list; an insert failure is deliberately ignored */
	(void)sctp_insert_laddr(list, ifa, 0);
	return;
}

/*
 * Remove a local address from the TCB local address restricted list.
 *
 * Nothing happens when the address is not on the list, or when the endpoint
 * is subset-bound with ASCONF disabled and has fewer than two local
 * addresses.
 */
void
sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
{
	struct sctp_inpcb *inp;
	struct sctp_laddr *laddr;

	/*
	 * This is called by asconf work. It is assumed that a) The TCB is
	 * locked and b) The INP is locked. This is true in as much as I can
	 * trace through the entry asconf code where I did these locks.
	 * Again, the ASCONF code is a bit different in that it does lock
	 * the INP during its work often times. This must be since we don't
	 * want other proc's looking up things while what they are looking
	 * up is changing :-D
	 */

	inp = stcb->sctp_ep;
	/* if subset bound and don't allow ASCONF's, can't delete last */
	if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) &&
	    sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) {
		if (stcb->sctp_ep->laddr_count < 2) {
			/* can't delete last address */
			return;
		}
	}
	/*
	 * Plain LIST_FOREACH is fine here: the walk stops immediately
	 * after the one removal.
	 */
	LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
		/* remove the address if it exists */
		if (laddr->ifa == NULL)
			continue;
		if (laddr->ifa == ifa) {
			sctp_remove_laddr(laddr);
			return;
		}
	}

	/* address not found!
*/ return; } /* sysctl */ static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC; static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR; #if defined(SCTP_MCORE_INPUT) && defined(SMP) struct sctp_mcore_ctrl *sctp_mcore_workers = NULL; int *sctp_cpuarry = NULL; void sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use) { /* Queue a packet to a processor for the specified core */ struct sctp_mcore_queue *qent; struct sctp_mcore_ctrl *wkq; int need_wake = 0; if (sctp_mcore_workers == NULL) { /* Something went way bad during setup */ sctp_input_with_port(m, off, 0); return; } SCTP_MALLOC(qent, struct sctp_mcore_queue *, (sizeof(struct sctp_mcore_queue)), SCTP_M_MCORE); if (qent == NULL) { /* This is trouble */ sctp_input_with_port(m, off, 0); return; } qent->vn = curvnet; qent->m = m; qent->off = off; qent->v6 = 0; wkq = &sctp_mcore_workers[cpu_to_use]; SCTP_MCORE_QLOCK(wkq); TAILQ_INSERT_TAIL(&wkq->que, qent, next); if (wkq->running == 0) { need_wake = 1; } SCTP_MCORE_QUNLOCK(wkq); if (need_wake) { wakeup(&wkq->running); } } static void sctp_mcore_thread(void *arg) { struct sctp_mcore_ctrl *wkq; struct sctp_mcore_queue *qent; wkq = (struct sctp_mcore_ctrl *)arg; struct mbuf *m; int off, v6; /* Wait for first tickle */ SCTP_MCORE_LOCK(wkq); wkq->running = 0; msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); SCTP_MCORE_UNLOCK(wkq); /* Bind to our cpu */ thread_lock(curthread); sched_bind(curthread, wkq->cpuid); thread_unlock(curthread); /* Now lets start working */ SCTP_MCORE_LOCK(wkq); /* Now grab lock and go */ for (;;) { SCTP_MCORE_QLOCK(wkq); skip_sleep: wkq->running = 1; qent = TAILQ_FIRST(&wkq->que); if (qent) { TAILQ_REMOVE(&wkq->que, qent, next); SCTP_MCORE_QUNLOCK(wkq); CURVNET_SET(qent->vn); m = qent->m; off = qent->off; v6 = qent->v6; SCTP_FREE(qent, SCTP_M_MCORE); if (v6 == 0) { sctp_input_with_port(m, off, 0); } else { SCTP_PRINTF("V6 not yet supported\n"); sctp_m_freem(m); } CURVNET_RESTORE(); SCTP_MCORE_QLOCK(wkq); } wkq->running 
= 0;
		/* More work arrived while we were busy: skip the sleep. */
		if (!TAILQ_EMPTY(&wkq->que)) {
			goto skip_sleep;
		}
		SCTP_MCORE_QUNLOCK(wkq);
		msleep(&wkq->running,
		    &wkq->core_mtx,
		    0, "wait for pkt", 0);
	}
}

/*
 * Create the per-CPU input worker state and start one kernel process per
 * CPU.
 *
 * Does nothing on a single-CPU system or when the worker array already
 * exists.  The worker array is sized by mp_maxid + 1 and indexed by CPU id;
 * sctp_cpuarry is filled with the CPU ids in CPU_FOREACH order.
 */
static void
sctp_startup_mcore_threads(void)
{
	int i, cpu;

	if (mp_ncpus == 1)
		return;

	if (sctp_mcore_workers != NULL) {
		/*
		 * Already been here in some previous vnet?
		 */
		return;
	}
	SCTP_MALLOC(sctp_mcore_workers, struct sctp_mcore_ctrl *,
	    ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl)),
	    SCTP_M_MCORE);
	if (sctp_mcore_workers == NULL) {
		/* TSNH I hope */
		return;
	}
	memset(sctp_mcore_workers, 0, ((mp_maxid + 1) *
	    sizeof(struct sctp_mcore_ctrl)));
	/* Init the structures */
	for (i = 0; i <= mp_maxid; i++) {
		TAILQ_INIT(&sctp_mcore_workers[i].que);
		SCTP_MCORE_LOCK_INIT(&sctp_mcore_workers[i]);
		SCTP_MCORE_QLOCK_INIT(&sctp_mcore_workers[i]);
		sctp_mcore_workers[i].cpuid = i;
	}
	if (sctp_cpuarry == NULL) {
		/*
		 * NOTE(review): unlike the worker array above, the result
		 * of this allocation is not checked before it is written
		 * to -- confirm whether SCTP_MALLOC can fail here.
		 */
		SCTP_MALLOC(sctp_cpuarry, int *,
		    (mp_ncpus * sizeof(int)),
		    SCTP_M_MCORE);
		i = 0;
		CPU_FOREACH(cpu) {
			sctp_cpuarry[i] = cpu;
			i++;
		}
	}
	/* Now start them all; a kproc_create() failure is ignored */
	CPU_FOREACH(cpu) {
		(void)kproc_create(sctp_mcore_thread,
		    (void *)&sctp_mcore_workers[cpu],
		    &sctp_mcore_workers[cpu].thread_proc,
		    0,
		    SCTP_KTHREAD_PAGES,
		    SCTP_MCORE_NAME);
	}
}
#endif

void
sctp_pcb_init(void)
{
	/*
	 * SCTP initialization for the PCB structures should be called by
	 * the sctp_init() function.
*/ int i; struct timeval tv; if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) { /* error I was called twice */ return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 1; #if defined(SCTP_LOCAL_TRACE_BUF) memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); #endif #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_MALLOC(SCTP_BASE_STATS, struct sctpstat *, ((mp_maxid + 1) * sizeof(struct sctpstat)), SCTP_M_MCORE); #endif (void)SCTP_GETTIME_TIMEVAL(&tv); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) memset(SCTP_BASE_STATS, 0, sizeof(struct sctpstat) * (mp_maxid + 1)); SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t)tv.tv_usec; #else memset(&SCTP_BASE_STATS, 0, sizeof(struct sctpstat)); SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t)tv.tv_usec; #endif /* init the empty list of (All) Endpoints */ LIST_INIT(&SCTP_BASE_INFO(listhead)); /* init the hash table of endpoints */ TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale)); SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31), &SCTP_BASE_INFO(hashasocmark)); SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashmark)); SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashtcpmark)); SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize); SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH, &SCTP_BASE_INFO(hashvrfmark)); SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE, &SCTP_BASE_INFO(vrf_ifn_hashmark)); /* init the zones */ /* * FIX ME: Should 
check for NULL returns, but if it does fail we are * doomed to panic anyways... add later maybe. */ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep", sizeof(struct sctp_inpcb), maxsockets); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc", sizeof(struct sctp_tcb), sctp_max_number_of_assoc); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr", sizeof(struct sctp_laddr), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr", sizeof(struct sctp_nets), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk", sizeof(struct sctp_tmit_chunk), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq", sizeof(struct sctp_queued_to_read), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out", sizeof(struct sctp_stream_queue_pending), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf", sizeof(struct sctp_asconf), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack", sizeof(struct sctp_asconf_ack), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); /* Master Lock INIT for info structure */ SCTP_INP_INFO_LOCK_INIT(); SCTP_STATLOG_INIT_LOCK(); SCTP_IPI_COUNT_INIT(); SCTP_IPI_ADDR_INIT(); #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_INIT(); #endif LIST_INIT(&SCTP_BASE_INFO(addr_wq)); SCTP_WQ_ADDR_INIT(); /* not sure if we need all the counts */ SCTP_BASE_INFO(ipi_count_ep) = 0; /* assoc/tcb zone info */ SCTP_BASE_INFO(ipi_count_asoc) = 0; /* local addrlist zone info */ SCTP_BASE_INFO(ipi_count_laddr) = 0; /* remote addrlist zone info */ SCTP_BASE_INFO(ipi_count_raddr) = 0; /* chunk info */ SCTP_BASE_INFO(ipi_count_chunk) = 0; 
/* socket queue zone info */ SCTP_BASE_INFO(ipi_count_readq) = 0; /* stream out queue cont */ SCTP_BASE_INFO(ipi_count_strmoq) = 0; SCTP_BASE_INFO(ipi_free_strmoq) = 0; SCTP_BASE_INFO(ipi_free_chunks) = 0; SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer)); /* Init the TIMEWAIT list */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]); } sctp_startup_iterator(); #if defined(SCTP_MCORE_INPUT) && defined(SMP) sctp_startup_mcore_threads(); #endif /* * INIT the default VRF which for BSD is the only one, other O/S's * may have more. But initially they must start with one and then * add the VRF's as addresses are added. */ sctp_init_vrf_list(SCTP_DEFAULT_VRF); } /* * Assumes that the SCTP_BASE_INFO() lock is NOT held. */ void sctp_pcb_finish(void) { struct sctp_vrflist *vrf_bucket; struct sctp_vrf *vrf, *nvrf; struct sctp_ifn *ifn, *nifn; struct sctp_ifa *ifa, *nifa; struct sctpvtaghead *chain; struct sctp_tagblock *twait_block, *prev_twait_block; struct sctp_laddr *wi, *nwi; int i; struct sctp_iterator *it, *nit; if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_PRINTF("%s: race condition on teardown.\n", __func__); return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 0; /* * In FreeBSD the iterator thread never exits but we do clean up. * The only way FreeBSD reaches here is if we have VRF's but we * still add the ifdef to make it compile on old versions. */ retry: SCTP_IPI_ITERATOR_WQ_LOCK(); /* * sctp_iterator_worker() might be working on an it entry without * holding the lock. We won't find it on the list either and * continue and free/destroy it. While holding the lock, spin, to * avoid the race condition as sctp_iterator_worker() will have to * wait to re-acquire the lock. */ if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) { SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. 
" "cur_it=%p\n", __func__, sctp_it_ctl.cur_it); DELAY(10); goto retry; } TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_ITERATOR_LOCK(); if ((sctp_it_ctl.cur_it) && (sctp_it_ctl.cur_it->vn == curvnet)) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } SCTP_ITERATOR_UNLOCK(); SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer)); SCTP_WQ_ADDR_LOCK(); LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) { LIST_REMOVE(wi, sctp_nxt_addr); SCTP_DECR_LADDR_COUNT(); if (wi->action == SCTP_DEL_IP_ADDRESS) { SCTP_FREE(wi->ifa, SCTP_M_IFA); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi); } SCTP_WQ_ADDR_UNLOCK(); /* * free the vrf/ifn/ifa lists and hashes (be sure address monitor is * destroyed first). */ SCTP_IPI_ADDR_WLOCK(); vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH_SAFE(vrf, vrf_bucket, next_vrf, nvrf) { LIST_FOREACH_SAFE(ifn, &vrf->ifnlist, next_ifn, nifn) { LIST_FOREACH_SAFE(ifa, &ifn->ifalist, next_ifa, nifa) { /* free the ifa */ LIST_REMOVE(ifa, next_bucket); LIST_REMOVE(ifa, next_ifa); SCTP_FREE(ifa, SCTP_M_IFA); } /* free the ifn */ LIST_REMOVE(ifn, next_bucket); LIST_REMOVE(ifn, next_ifn); SCTP_FREE(ifn, SCTP_M_IFN); } SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); /* free the vrf */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); } SCTP_IPI_ADDR_WUNLOCK(); /* free the vrf hashes */ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark)); SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark)); /* * free the TIMEWAIT list elements malloc'd in the function * sctp_add_vtag_to_timewait()... 
*/ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { chain = &SCTP_BASE_INFO(vtag_timewait)[i]; if (!LIST_EMPTY(chain)) { prev_twait_block = NULL; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { if (prev_twait_block) { SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } prev_twait_block = twait_block; } SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } } /* free the locks and mutexes */ #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_DESTROY(); #endif SCTP_IPI_ADDR_DESTROY(); SCTP_STATLOG_DESTROY(); SCTP_INP_INFO_LOCK_DESTROY(); SCTP_WQ_ADDR_DESTROY(); /* Get rid of other stuff too. */ if (SCTP_BASE_INFO(sctp_asochash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); if (SCTP_BASE_INFO(sctp_ephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE); #endif } int sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, int offset, int limit, struct sockaddr *src, struct sockaddr *dst, struct sockaddr *altsa, uint16_t port) { /* * grub through the INIT pulling addresses and loading them to the * nets structure in the asoc. The from address in the mbuf should * also be loaded (if it is not already). 
This routine can be called * with either INIT or INIT-ACK's as long as the m points to the IP * packet and the offset points to the beginning of the parameters. */ struct sctp_inpcb *inp; struct sctp_nets *net, *nnet, *net_tmp; struct sctp_paramhdr *phdr, param_buf; struct sctp_tcb *stcb_tmp; uint16_t ptype, plen; struct sockaddr *sa; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t peer_supports_ecn; uint8_t peer_supports_prsctp; uint8_t peer_supports_auth; uint8_t peer_supports_asconf; uint8_t peer_supports_asconf_ack; uint8_t peer_supports_reconfig; uint8_t peer_supports_nrsack; uint8_t peer_supports_pktdrop; uint8_t peer_supports_idata; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* First get the destination address setup too. */ #ifdef INET memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = stcb->rport; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; #endif if (altsa) { sa = altsa; } else { sa = src; } peer_supports_idata = 0; peer_supports_ecn = 0; peer_supports_prsctp = 0; peer_supports_auth = 0; peer_supports_asconf = 0; peer_supports_asconf_ack = 0; peer_supports_reconfig = 0; peer_supports_nrsack = 0; peer_supports_pktdrop = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* mark all addresses that we have currently on the list */ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; } /* does the source address already exist? 
if so skip it */ inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* no scope set here since we have a tcb already. */ switch (sa->sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { return (-1); } } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { return (-2); } } break; #endif default: break; } } else { if (net_tmp != NULL && stcb_tmp == stcb) { net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } else if (stcb_tmp != stcb) { /* It belongs to another association? */ if (stcb_tmp) SCTP_TCB_UNLOCK(stcb_tmp); return (-3); } } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-4); } /* now we must go through each of the params. 
*/ phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); /* * SCTP_PRINTF("ptype => %0x, plen => %d\n", * (uint32_t)ptype, (int)plen); */ if (offset + plen > limit) { break; } if (plen < sizeof(struct sctp_paramhdr)) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS) { if (stcb->asoc.scope.ipv4_addr_legal) { struct sctp_ipv4addr_param *p4, p4_buf; /* ok get the v4 address and check/add */ phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (plen != sizeof(struct sctp_ipv4addr_param) || phdr == NULL) { return (-5); } p4 = (struct sctp_ipv4addr_param *)phdr; sin.sin_addr.s_addr = p4->addr; if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { /* Skip multi-cast addresses */ goto next_param; } if ((sin.sin_addr.s_addr == INADDR_BROADCAST) || (sin.sin_addr.s_addr == INADDR_ANY)) { goto next_param; } sa = (struct sockaddr *)&sin; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* * no scope set since we have a tcb * already */ /* * we must validate the state again * here */ add_it_now: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-7); } if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { return (-8); } } else if (stcb_tmp == stcb) { if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-10); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. 
*/ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, false, SCTP_SO_NOT_LOCKED); goto add_it_now; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-12); } return (-13); } } } else #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS) { if (stcb->asoc.scope.ipv6_addr_legal) { /* ok get the v6 address and check/add */ struct sctp_ipv6addr_param *p6, p6_buf; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (plen != sizeof(struct sctp_ipv6addr_param) || phdr == NULL) { return (-14); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { /* Skip multi-cast addresses */ goto next_param; } if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * Link local make no sense without * scope */ goto next_param; } sa = (struct sockaddr *)&sin6; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb_tmp == NULL && (inp == stcb->sctp_ep || inp == NULL)) { /* * we must validate the state again * here */ add_it_now6: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-16); } /* * we must add the address, no scope * set */ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { return (-17); } } else if (stcb_tmp == stcb) { /* * we must validate the state again * here */ if (stcb->asoc.state == 0) { /* the assoc was freed? 
*/ return (-19); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, false, SCTP_SO_NOT_LOCKED); goto add_it_now6; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-21); } return (-22); } } } else #endif if (ptype == SCTP_ECN_CAPABLE) { peer_supports_ecn = 1; } else if (ptype == SCTP_ULP_ADAPTATION) { if (stcb->asoc.state != SCTP_STATE_OPEN) { struct sctp_adaptation_layer_indication ai, *aip; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ai, sizeof(ai)); aip = (struct sctp_adaptation_layer_indication *)phdr; if (aip) { stcb->asoc.peers_adaptation = ntohl(aip->indication); stcb->asoc.adaptation_needed = 1; } } } else if (ptype == SCTP_SET_PRIM_ADDR) { struct sctp_asconf_addr_param lstore, *fee; int lptype; struct sockaddr *lsa = NULL; #ifdef INET struct sctp_asconf_addrv4_param *fii; #endif if (stcb->asoc.asconf_supported == 0) { return (-100); } if (plen > sizeof(lstore)) { return (-23); } if (plen < sizeof(struct sctp_asconf_addrv4_param)) { return (-101); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&lstore, plen); if (phdr == NULL) { return (-24); } fee = (struct sctp_asconf_addr_param *)phdr; lptype = ntohs(fee->addrp.ph.param_type); switch (lptype) { #ifdef INET case SCTP_IPV4_ADDRESS: if (plen != sizeof(struct sctp_asconf_addrv4_param)) { SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addrv4_param), plen); } else { fii = (struct 
sctp_asconf_addrv4_param *)fee; sin.sin_addr.s_addr = fii->addrp.addr; lsa = (struct sockaddr *)&sin; } break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (plen != sizeof(struct sctp_asconf_addr_param)) { SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addr_param), plen); } else { memcpy(sin6.sin6_addr.s6_addr, fee->addrp.addr, sizeof(fee->addrp.addr)); lsa = (struct sockaddr *)&sin6; } break; #endif default: break; } if (lsa) { (void)sctp_set_primary_addr(stcb, sa, NULL); } } else if (ptype == SCTP_HAS_NAT_SUPPORT) { stcb->asoc.peer_supports_nat = 1; } else if (ptype == SCTP_PRSCTP_SUPPORTED) { /* Peer supports pr-sctp */ peer_supports_prsctp = 1; } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; int num_ent, i; if (plen > sizeof(local_store)) { return (-35); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, plen); if (phdr == NULL) { return (-25); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: peer_supports_asconf = 1; break; case SCTP_ASCONF_ACK: peer_supports_asconf_ack = 1; break; case SCTP_FORWARD_CUM_TSN: peer_supports_prsctp = 1; break; case SCTP_PACKET_DROPPED: peer_supports_pktdrop = 1; break; case SCTP_NR_SELECTIVE_ACK: peer_supports_nrsack = 1; break; case SCTP_STREAM_RESET: peer_supports_reconfig = 1; break; case SCTP_AUTHENTICATION: peer_supports_auth = 1; break; case SCTP_IDATA: peer_supports_idata = 1; break; default: /* one I have not learned yet */ break; } } } else if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; if (got_random) { /* already processed a RANDOM */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr 
*)random_store, plen); if (phdr == NULL) return (-26); p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); /* enforce the random length */ if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-27); } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; uint16_t i; if (plen > sizeof(hmacs_store)) break; if (got_hmacs) { /* already processed a HMAC list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, plen); if (phdr == NULL) return (-28); hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { return (-29); } if (stcb->asoc.peer_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.peer_hmacs); stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.peer_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs, ntohs(hmacs->hmac_ids[i])); } } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; if (got_chklist) { /* already processed a Chunks list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, plen); if (phdr == NULL) return (-30); chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); if (stcb->asoc.peer_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.peer_auth_chunks); else stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.peer_auth_chunks); /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } got_chklist = 1; } else if ((ptype == SCTP_HEARTBEAT_INFO) || (ptype == 
SCTP_STATE_COOKIE) || (ptype == SCTP_UNRECOG_PARAM) || (ptype == SCTP_COOKIE_PRESERVE) || (ptype == SCTP_SUPPORTED_ADDRTYPE) || (ptype == SCTP_ADD_IP_ADDRESS) || (ptype == SCTP_DEL_IP_ADDRESS) || (ptype == SCTP_ERROR_CAUSE_IND) || (ptype == SCTP_SUCCESS_REPORT)) { /* don't care */ } else { if ((ptype & 0x8000) == 0x0000) { /* * must stop processing the rest of the * param's. Any report bits were handled * with the call to * sctp_arethere_unrecognized_parameters() * when the INIT or INIT-ACK was first seen. */ break; } } next_param: offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) == SCTP_ADDR_NOT_IN_ASSOC) { /* This address has been removed from the asoc */ /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); if (net == stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } sctp_free_remote_addr(net); } } if ((stcb->asoc.ecn_supported == 1) && (peer_supports_ecn == 0)) { stcb->asoc.ecn_supported = 0; } if ((stcb->asoc.prsctp_supported == 1) && (peer_supports_prsctp == 0)) { stcb->asoc.prsctp_supported = 0; } if ((stcb->asoc.auth_supported == 1) && ((peer_supports_auth == 0) || (got_random == 0) || (got_hmacs == 0))) { stcb->asoc.auth_supported = 0; } if ((stcb->asoc.asconf_supported == 1) && ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) || (stcb->asoc.auth_supported == 0) || (saw_asconf == 0) || (saw_asconf_ack == 0))) { stcb->asoc.asconf_supported = 0; } if ((stcb->asoc.reconfig_supported == 1) && (peer_supports_reconfig == 0)) { stcb->asoc.reconfig_supported = 0; } if ((stcb->asoc.idata_supported 
== 1) && (peer_supports_idata == 0)) { stcb->asoc.idata_supported = 0; } if ((stcb->asoc.nrsack_supported == 1) && (peer_supports_nrsack == 0)) { stcb->asoc.nrsack_supported = 0; } if ((stcb->asoc.pktdrop_supported == 1) && (peer_supports_pktdrop == 0)) { stcb->asoc.pktdrop_supported = 0; } /* validate authentication required parameters */ if ((peer_supports_auth == 0) && (got_chklist == 1)) { /* peer does not support auth but sent a chunks list? */ return (-31); } if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) { /* peer supports asconf but not auth? */ return (-32); } else if ((peer_supports_asconf == 1) && (peer_supports_auth == 1) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-33); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); } else { keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { memcpy(new_key->key + keylen, chunks, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { memcpy(new_key->key + keylen, hmacs, sizeof(*hmacs) + hmacs_len); } } else { /* failed to get memory for the key */ return (-34); } if (stcb->asoc.authinfo.peer_random != NULL) sctp_free_key(stcb->asoc.authinfo.peer_random); stcb->asoc.authinfo.peer_random = new_key; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); return (0); } int sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa, struct sctp_nets *net) { /* make sure the requested primary address exists in the assoc */ if (net == NULL && sa) net = sctp_findnet(stcb, sa); if (net == NULL) { /* didn't find the requested primary 
address! */ return (-1); } else { /* set the primary address */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* Must be confirmed, so queue to set */ net->dest_state |= SCTP_ADDR_REQ_PRIMARY; return (0); } stcb->asoc.primary_destination = net; if (((net->dest_state & SCTP_ADDR_PF) == 0) && (stcb->asoc.alternate != NULL)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } net = TAILQ_FIRST(&stcb->asoc.nets); if (net != stcb->asoc.primary_destination) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficient if the * primary is the first on the list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } } bool sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now) { struct sctpasochead *head; struct sctp_tcb *stcb; SCTP_INP_INFO_LOCK_ASSERT(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { /* * We choose not to lock anything here. TCB's can't be * removed since we have the read lock, so they can't be * freed on us, same thing for the INP. I may be wrong with * this assumption, but we will go with it for now :-) */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (stcb->asoc.my_vtag == tag) { /* candidate */ if (stcb->rport != rport) { continue; } if (stcb->sctp_ep->sctp_lport != lport) { continue; } /* The tag is currently used, so don't use it. */ return (false); } } return (!sctp_is_in_timewait(tag, lport, rport, (uint32_t)now->tv_sec)); } static void sctp_drain_mbufs(struct sctp_tcb *stcb) { /* * We must hunt this association for MBUF's past the cumack (i.e. * out of order data that we can renege on). 
*/ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; uint32_t cumulative_tsn_p1; struct sctp_queued_to_read *control, *ncontrol; int cnt, strmat; uint32_t gap, i; int fnd = 0; /* We look for anything larger than the cum-ack + 1 */ asoc = &stcb->asoc; if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) { /* none we can reneg on. */ return; } SCTP_STAT_INCR(sctps_protocol_drains_done); cumulative_tsn_p1 = asoc->cumulative_tsn + 1; cnt = 0; /* Ok that was fun, now we will drain all the inbound streams? */ for (strmat = 0; strmat < asoc->streamincnt; strmat++) { TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_ORDERED) { panic("Huh control: %p on_q: %d -- not ordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? 
*/ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].uno_inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_UNORDERED) { panic("Huh control: %p on_q: %d -- not unordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? 
*/ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } } if (cnt) { /* We must back down to see what the new highest is */ for (i = asoc->highest_tsn_inside_map; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) { SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { asoc->highest_tsn_inside_map = i; fnd = 1; break; } } if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } /* * Question, should we go through the delivery queue? The * only reason things are on here is the app not reading OR * a p-d-api up. An attacker COULD send enough in to * initiate the PD-API and then send a bunch of stuff to * other streams... these would wind up on the delivery * queue.. and then we would not get to them. But in order * to do this I then have to back-track and un-deliver * sequence numbers in streams.. el-yucko. I think for now * we will NOT look at the delivery queue and leave it to be * something to consider later. An alternative would be to * abort the P-D-API with a notification and then deliver * the data.... Or another method might be to keep track of * how many times the situation occurs and if we see a * possible attack underway just abort the association. 
*/
 #ifdef SCTP_DEBUG
 		SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt);
 #endif
 		/*
 		 * Now do we need to find a new
 		 * asoc->highest_tsn_inside_map?
 		 */
 		asoc->last_revoke_count = cnt;
 		sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
 		    SCTP_FROM_SCTP_PCB + SCTP_LOC_11);
 		/* sa_ignore NO_NULL_CHK */
 		sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED);
 		sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED);
 	}
 	/*
 	 * Another issue, in un-setting the TSN's in the mapping array we
 	 * DID NOT adjust the highest_tsn marker. This will cause one of
 	 * two things to occur. It may cause us to do extra work in checking
 	 * for our mapping array movement. More importantly it may cause us
 	 * to SACK every datagram. This may not be a bad thing though since
 	 * we will recover once we get our cum-ack above and all this stuff
 	 * we dumped recovered.
 	 */
 }
 
 /*
  * sctp_drain: reneging pass over every SCTP association.
  *
  * As of this change the function is file-local and runs between
  * NET_EPOCH_ENTER() and NET_EPOCH_EXIT().  Under the VNET list read lock
  * it visits each vnet, counts the call in sctps_protocol_drain_calls and,
  * unless the sctp_do_drain sysctl is 0, walks every endpoint on the
  * global list and every association on each endpoint, calling
  * sctp_drain_mbufs() on each association with its TCB lock held.
  * Takes no arguments and returns nothing.
  */
-void
+static void
 sctp_drain(void)
 {
+	struct epoch_tracker et;
+	VNET_ITERATOR_DECL(vnet_iter);
+
+	NET_EPOCH_ENTER(et);
 	/*
 	 * We must walk the PCB lists for ALL associations here. The system
 	 * is LOW on MBUF's and needs help. This is where reneging will
 	 * occur. We really hope this does NOT happen!
 	 */
-	VNET_ITERATOR_DECL(vnet_iter);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct sctp_inpcb *inp;
 		struct sctp_tcb *stcb;
 
 		SCTP_STAT_INCR(sctps_protocol_drain_calls);
 		if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
 			/*
 			 * NOTE(review): the VIMAGE branch leaves this iteration
 			 * before the CURVNET_RESTORE() at the bottom of the loop
 			 * runs, and the other branch returns without reaching
 			 * CURVNET_RESTORE() or VNET_LIST_RUNLOCK_NOSLEEP()
 			 * (presumably no-ops without VIMAGE) -- confirm both
 			 * are intended.
 			 */
 #ifdef VIMAGE
 			continue;
 #else
+			NET_EPOCH_EXIT(et);
 			return;
 #endif
 		}
 		/* Lock order used here: global info lock, endpoint, association. */
 		SCTP_INP_INFO_RLOCK();
 		LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
 			/* For each endpoint */
 			SCTP_INP_RLOCK(inp);
 			LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 				/* For each association */
 				SCTP_TCB_LOCK(stcb);
 				sctp_drain_mbufs(stcb);
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			SCTP_INP_RUNLOCK(inp);
 		}
 		SCTP_INP_INFO_RUNLOCK();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
+	NET_EPOCH_EXIT(et);
 }
 /* Register sctp_drain() for both low-memory events at the default priority. */
+EVENTHANDLER_DEFINE(vm_lowmem, sctp_drain, NULL, LOWMEM_PRI_DEFAULT);
+EVENTHANDLER_DEFINE(mbuf_lowmem, sctp_drain, NULL, LOWMEM_PRI_DEFAULT);
 
 /*
  * start a new iterator
  * iterates through all endpoints and associations based on the pcb_state
  * flags and asoc_state. "af" (mandatory) is executed for all matching
  * assocs and "ef" (optional) is executed when the iterator completes.
  * "inpf" (optional) is executed for each new endpoint as it is being
  * iterated through. inpe (optional) is called when the inp completes
  * its way through all the stcbs.
*/
 /*
  * Returns 0 once the new iterator has been appended to
  * sctp_it_ctl.iteratorhead, or -1 when "af" is NULL, when
  * sctp_pcb_initialized is 0 (checked before allocating and again under
  * the iterator work-queue lock), or when the iterator cannot be
  * allocated.  A non-NULL s_inp selects SCTP_ITERATOR_DO_SINGLE_INP with
  * that endpoint as the starting point; otherwise the iterator starts at
  * the first endpoint on the global list with SCTP_ITERATOR_DO_ALL_INP.
  */
 int
 sctp_initiate_iterator(inp_func inpf,
     asoc_func af,
     inp_func inpe,
     uint32_t pcb_state,
     uint32_t pcb_features,
     uint32_t asoc_state,
     void *argp,
     uint32_t argi,
     end_func ef,
     struct sctp_inpcb *s_inp,
     uint8_t chunk_output_off)
 {
 	struct sctp_iterator *it = NULL;
 
 	/* The per-association callback is the only mandatory one. */
 	if (af == NULL) {
 		return (-1);
 	}
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 		SCTP_PRINTF("%s: abort on initialize being %d\n", __func__,
 		    SCTP_BASE_VAR(sctp_pcb_initialized));
 		return (-1);
 	}
 	SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator),
 	    SCTP_M_ITER);
 	if (it == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM);
 		return (-1);
 	}
 	memset(it, 0, sizeof(*it));
 	/* Record the callbacks and the caller's filter/argument values. */
 	it->function_assoc = af;
 	it->function_inp = inpf;
 	/* With no per-endpoint callback the current endpoint starts out flagged as done. */
 	if (inpf)
 		it->done_current_ep = 0;
 	else
 		it->done_current_ep = 1;
 	it->function_atend = ef;
 	it->pointer = argp;
 	it->val = argi;
 	it->pcb_flags = pcb_state;
 	it->pcb_features = pcb_features;
 	it->asoc_state = asoc_state;
 	it->function_inp_end = inpe;
 	it->no_chunk_output = chunk_output_off;
 	/* Remember the vnet that was current when the iterator was created. */
 	it->vn = curvnet;
 	if (s_inp) {
 		/* Assume lock is held here */
 		it->inp = s_inp;
 		/* Reference the endpoint the iterator will start from. */
 		SCTP_INP_INCR_REF(it->inp);
 		it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP;
 	} else {
 		/* Pick the head of the global endpoint list under the info read lock. */
 		SCTP_INP_INFO_RLOCK();
 		it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead));
 		if (it->inp) {
 			SCTP_INP_INCR_REF(it->inp);
 		}
 		SCTP_INP_INFO_RUNLOCK();
 		it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP;
 	}
 	SCTP_IPI_ITERATOR_WQ_LOCK();
 	/* Re-check initialization now that the work-queue lock is held. */
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 		SCTP_IPI_ITERATOR_WQ_UNLOCK();
 		SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__,
 		    SCTP_BASE_VAR(sctp_pcb_initialized), it);
 		/*
 		 * NOTE(review): the reference taken on it->inp above is not
 		 * dropped before the iterator is freed -- confirm whether that
 		 * matters on this path.
 		 */
 		SCTP_FREE(it, SCTP_M_ITER);
 		return (-1);
 	}
 	TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
 	/* Only kick the iterator when it is not already marked as running. */
 	if (sctp_it_ctl.iterator_running == 0) {
 		sctp_wakeup_iterator();
 	}
 	SCTP_IPI_ITERATOR_WQ_UNLOCK();
 	/* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */
 	return (0);
 }
 
 /*
  * Atomically add flags to the sctp_flags of an inp.
  * To be used when the write lock of the inp is not held.
*/ void sctp_pcb_add_flags(struct sctp_inpcb *inp, uint32_t flags) { uint32_t old_flags, new_flags; do { old_flags = inp->sctp_flags; new_flags = old_flags | flags; } while (atomic_cmpset_int(&inp->sctp_flags, old_flags, new_flags) == 0); } diff --git a/sys/netinet/sctp_pcb.h b/sys/netinet/sctp_pcb.h index 687ccf6a1c50..fd8115a8101a 100644 --- a/sys/netinet/sctp_pcb.h +++ b/sys/netinet/sctp_pcb.h @@ -1,646 +1,644 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_PCB_H_ #define _NETINET_SCTP_PCB_H_ #include #include #include #include LIST_HEAD(sctppcbhead, sctp_inpcb); LIST_HEAD(sctpasochead, sctp_tcb); LIST_HEAD(sctpladdr, sctp_laddr); LIST_HEAD(sctpvtaghead, sctp_tagblock); LIST_HEAD(sctp_vrflist, sctp_vrf); LIST_HEAD(sctp_ifnlist, sctp_ifn); LIST_HEAD(sctp_ifalist, sctp_ifa); TAILQ_HEAD(sctp_readhead, sctp_queued_to_read); TAILQ_HEAD(sctp_streamhead, sctp_stream_queue_pending); #include #include #define SCTP_PCBHASH_ALLADDR(port, mask) (port & mask) #define SCTP_PCBHASH_ASOC(tag, mask) (tag & mask) struct sctp_vrf { LIST_ENTRY(sctp_vrf) next_vrf; struct sctp_ifalist *vrf_addr_hash; struct sctp_ifnlist ifnlist; uint32_t vrf_id; uint32_t tbl_id_v4; /* default v4 table id */ uint32_t tbl_id_v6; /* default v6 table id */ uint32_t total_ifa_count; u_long vrf_addr_hashmark; uint32_t refcount; }; struct sctp_ifn { struct sctp_ifalist ifalist; struct sctp_vrf *vrf; LIST_ENTRY(sctp_ifn) next_ifn; LIST_ENTRY(sctp_ifn) next_bucket; void *ifn_p; /* never access without appropriate lock */ uint32_t ifn_mtu; uint32_t ifn_type; uint32_t ifn_index; /* shorthand way to look at ifn for reference */ uint32_t refcount; /* number of reference held should be >= * ifa_count */ uint32_t ifa_count; /* IFA's we hold (in our list - ifalist) */ uint32_t num_v6; /* number of v6 addresses */ uint32_t num_v4; /* number of v4 addresses */ uint32_t registered_af; /* registered 
address family for i/f events */ char ifn_name[SCTP_IFNAMSIZ]; }; /* SCTP local IFA flags */ #define SCTP_ADDR_VALID 0x00000001 /* its up and active */ #define SCTP_BEING_DELETED 0x00000002 /* being deleted, when * refcount = 0. Note that it * is pulled from the ifn list * and ifa_p is nulled right * away but it cannot be freed * until the last *net * pointing to it is deleted. */ #define SCTP_ADDR_DEFER_USE 0x00000004 /* Hold off using this one */ #define SCTP_ADDR_IFA_UNUSEABLE 0x00000008 struct sctp_ifa { LIST_ENTRY(sctp_ifa) next_ifa; LIST_ENTRY(sctp_ifa) next_bucket; struct sctp_ifn *ifn_p; /* back pointer to parent ifn */ void *ifa; /* pointer to ifa, needed for flag update for * that we MUST lock appropriate locks. This * is for V6. */ union sctp_sockstore address; uint32_t refcount; /* number of folks referring to this */ uint32_t flags; uint32_t localifa_flags; uint32_t vrf_id; /* vrf_id of this addr (for deleting) */ uint8_t src_is_loop; uint8_t src_is_priv; uint8_t src_is_glob; uint8_t resv; }; struct sctp_laddr { LIST_ENTRY(sctp_laddr) sctp_nxt_addr; /* next in list */ struct sctp_ifa *ifa; uint32_t action; /* Used during asconf and adding if no-zero * src-addr selection will not consider this * address. 
*/ struct timeval start_time; /* time when this address was created */ }; struct sctp_block_entry { int error; }; struct sctp_timewait { uint32_t tv_sec_at_expire; /* the seconds from boot to expire */ uint32_t v_tag; /* the vtag that can not be reused */ uint16_t lport; /* the local port used in vtag */ uint16_t rport; /* the remote port used in vtag */ }; struct sctp_tagblock { LIST_ENTRY(sctp_tagblock) sctp_nxt_tagblock; struct sctp_timewait vtag_block[SCTP_NUMBER_IN_VTAG_BLOCK]; }; struct sctp_epinfo { #ifdef INET struct socket *udp4_tun_socket; #endif #ifdef INET6 struct socket *udp6_tun_socket; #endif struct sctpasochead *sctp_asochash; u_long hashasocmark; struct sctppcbhead *sctp_ephash; u_long hashmark; /*- * The TCP model represents a substantial overhead in that we get an * additional hash table to keep explicit connections in. The * listening TCP endpoint will exist in the usual ephash above and * accept only INIT's. It will be incapable of sending off an INIT. * When a dg arrives we must look in the normal ephash. If we find a * TCP endpoint that will tell us to go to the specific endpoint * hash and re-hash to find the right assoc/socket. If we find a UDP * model socket we then must complete the lookup. If this fails, * i.e. no association can be found then we must continue to see if * a sctp_peeloff()'d socket is in the tcpephash (a spun off socket * acts like a TCP model connected socket). 
*/ struct sctppcbhead *sctp_tcpephash; u_long hashtcpmark; uint32_t hashtblsize; struct sctp_vrflist *sctp_vrfhash; u_long hashvrfmark; struct sctp_ifnlist *vrf_ifn_hash; u_long vrf_ifn_hashmark; struct sctppcbhead listhead; struct sctpladdr addr_wq; /* ep zone info */ sctp_zone_t ipi_zone_ep; sctp_zone_t ipi_zone_asoc; sctp_zone_t ipi_zone_laddr; sctp_zone_t ipi_zone_net; sctp_zone_t ipi_zone_chunk; sctp_zone_t ipi_zone_readq; sctp_zone_t ipi_zone_strmoq; sctp_zone_t ipi_zone_asconf; sctp_zone_t ipi_zone_asconf_ack; struct rwlock ipi_ep_mtx; struct mtx ipi_iterator_wq_mtx; struct rwlock ipi_addr_mtx; struct mtx ipi_pktlog_mtx; struct mtx wq_addr_mtx; uint32_t ipi_count_ep; /* assoc/tcb zone info */ uint32_t ipi_count_asoc; /* local addrlist zone info */ uint32_t ipi_count_laddr; /* remote addrlist zone info */ uint32_t ipi_count_raddr; /* chunk structure list for output */ uint32_t ipi_count_chunk; /* socket queue zone info */ uint32_t ipi_count_readq; /* socket queue zone info */ uint32_t ipi_count_strmoq; /* Number of vrfs */ uint32_t ipi_count_vrfs; /* Number of ifns */ uint32_t ipi_count_ifns; /* Number of ifas */ uint32_t ipi_count_ifas; /* system wide number of free chunks hanging around */ uint32_t ipi_free_chunks; uint32_t ipi_free_strmoq; struct sctpvtaghead vtag_timewait[SCTP_STACK_VTAG_HASH_SIZE]; /* address work queue handling */ struct sctp_timer addr_wq_timer; }; struct sctp_base_info { /* * All static structures that anchor the system must be here. */ struct sctp_epinfo sctppcbinfo; #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) struct sctpstat *sctpstat; #else struct sctpstat sctpstat; #endif struct sctp_sysctl sctpsysctl; uint8_t first_time; char sctp_pcb_initialized; #if defined(SCTP_PACKET_LOGGING) int packet_log_writers; int packet_log_end; uint8_t packet_log_buffer[SCTP_PACKET_LOG_SIZE]; #endif eventhandler_tag eh_tag; }; /*- * Here we have all the relevant information for each SCTP entity created. 
We * will need to modify this as appropriate. We also need to figure out how to * access /dev/random. */ struct sctp_pcb { unsigned int time_of_secret_change; /* number of seconds from * timeval.tv_sec */ uint32_t secret_key[SCTP_HOW_MANY_SECRETS][SCTP_NUMBER_OF_SECRETS]; unsigned int size_of_a_cookie; uint32_t sctp_timeoutticks[SCTP_NUM_TMRS]; uint32_t sctp_minrto; uint32_t sctp_maxrto; uint32_t initial_rto; uint32_t initial_init_rto_max; unsigned int sctp_sack_freq; uint32_t sctp_sws_sender; uint32_t sctp_sws_receiver; uint32_t sctp_default_cc_module; uint32_t sctp_default_ss_module; /* authentication related fields */ struct sctp_keyhead shared_keys; sctp_auth_chklist_t *local_auth_chunks; sctp_hmaclist_t *local_hmacs; uint16_t default_keyid; uint32_t default_mtu; /* various thresholds */ /* Max times I will init at a guy */ uint16_t max_init_times; /* Max times I will send before we consider someone dead */ uint16_t max_send_times; uint16_t def_net_failure; uint16_t def_net_pf_threshold; /* number of streams to pre-open on an association */ uint16_t pre_open_stream_count; uint16_t max_open_streams_intome; /* random number generator */ uint32_t random_counter; uint8_t random_numbers[SCTP_SIGNATURE_ALOC_SIZE]; uint8_t random_store[SCTP_SIGNATURE_ALOC_SIZE]; /* * This timer is kept running per endpoint. When it fires it will * change the secret key. 
The default is once a hour */ struct sctp_timer signature_change; uint32_t def_cookie_life; /* defaults to 0 */ uint32_t auto_close_time; uint32_t initial_sequence_debug; uint32_t adaptation_layer_indicator; uint8_t adaptation_layer_indicator_provided; uint32_t store_at; uint32_t max_burst; uint32_t fr_max_burst; #ifdef INET6 uint32_t default_flowlabel; #endif uint8_t default_dscp; char current_secret_number; char last_secret_number; uint16_t port; /* remote UDP encapsulation port */ }; #ifndef SCTP_ALIGNMENT #define SCTP_ALIGNMENT 32 #endif #ifndef SCTP_ALIGNM1 #define SCTP_ALIGNM1 (SCTP_ALIGNMENT-1) #endif #define sctp_lport ip_inp.inp.inp_lport struct sctp_pcbtsn_rlog { uint32_t vtag; uint16_t strm; uint16_t seq; uint16_t sz; uint16_t flgs; }; #define SCTP_READ_LOG_SIZE 135 /* we choose the number to make a pcb a page */ struct sctp_inpcb { /*- * put an inpcb in front of it all, kind of a waste but we need to * for compatibility with all the other stuff. */ union { struct inpcb inp; char align[(sizeof(struct inpcb) + SCTP_ALIGNM1) & ~SCTP_ALIGNM1]; } ip_inp; /* Socket buffer lock protects read_queue and of course sb_cc */ struct sctp_readhead read_queue; LIST_ENTRY(sctp_inpcb) sctp_list; /* lists all endpoints */ /* hash of all endpoints for model */ LIST_ENTRY(sctp_inpcb) sctp_hash; /* count of local addresses bound, 0 if bound all */ int laddr_count; /* list of addrs in use by the EP, NULL if bound-all */ struct sctpladdr sctp_addr_list; /* * used for source address selection rotation when we are subset * bound */ struct sctp_laddr *next_addr_touse; /* back pointer to our socket */ struct socket *sctp_socket; uint64_t sctp_features; /* Feature flags */ uint32_t sctp_flags; /* INP state flag set */ uint32_t sctp_mobility_features; /* Mobility Feature flags */ struct sctp_pcb sctp_ep; /* SCTP ep data */ /* head of the hash of all associations */ struct sctpasochead *sctp_tcbhash; u_long sctp_hashmark; /* head of the list of all associations */ struct 
sctpasochead sctp_asoc_list; #ifdef SCTP_TRACK_FREED_ASOCS struct sctpasochead sctp_asoc_free_list; #endif uint32_t sctp_frag_point; uint32_t partial_delivery_point; uint32_t sctp_context; uint32_t max_cwnd; uint8_t local_strreset_support; uint32_t sctp_cmt_on_off; uint8_t ecn_supported; uint8_t prsctp_supported; uint8_t auth_supported; uint8_t idata_supported; uint8_t asconf_supported; uint8_t reconfig_supported; uint8_t nrsack_supported; uint8_t pktdrop_supported; struct sctp_nonpad_sndrcvinfo def_send; /*- * These three are here for the sosend_dgram * (pkt, pkt_last and control). * routine. However, I don't think anyone in * the current FreeBSD kernel calls this. So * they are candidates with sctp_sendm for * de-supporting. */ struct mbuf *pkt, *pkt_last; struct mbuf *control; struct mtx inp_mtx; struct mtx inp_create_mtx; struct mtx inp_rdata_mtx; int32_t refcount; uint32_t def_vrf_id; uint16_t fibnum; uint32_t total_sends; uint32_t total_recvs; uint32_t last_abort_code; uint32_t total_nospaces; struct sctpasochead *sctp_asocidhash; u_long hashasocidmark; uint32_t sctp_associd_counter; #ifdef SCTP_ASOCLOG_OF_TSNS struct sctp_pcbtsn_rlog readlog[SCTP_READ_LOG_SIZE]; uint32_t readlog_index; #endif }; struct sctp_tcb { struct socket *sctp_socket; /* back pointer to socket */ struct sctp_inpcb *sctp_ep; /* back pointer to ep */ LIST_ENTRY(sctp_tcb) sctp_tcbhash; /* next link in hash * table */ LIST_ENTRY(sctp_tcb) sctp_tcblist; /* list of all of the * TCB's */ LIST_ENTRY(sctp_tcb) sctp_tcbasocidhash; /* next link in asocid * hash table */ LIST_ENTRY(sctp_tcb) sctp_asocs; /* vtag hash list */ struct sctp_block_entry *block_entry; /* pointer locked by socket * send buffer */ struct sctp_association asoc; /*- * freed_by_sorcv_sincelast is protected by the sockbuf_lock NOT the * tcb_lock. Its special in this way to help avoid extra mutex calls * in the reading of data. 
*/ uint32_t freed_by_sorcv_sincelast; uint32_t total_sends; uint32_t total_recvs; int freed_from_where; uint16_t rport; /* remote port in network format */ uint16_t resv; struct mtx tcb_mtx; }; #include #if defined(_KERNEL) || defined(__Userspace__) /* Attention Julian, this is the extern that * goes with the base info. sctp_pcb.c has * the real definition. */ VNET_DECLARE(struct sctp_base_info, system_base_info); #ifdef INET6 int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b); #endif void sctp_fill_pcbinfo(struct sctp_pcbinfo *); struct sctp_ifn *sctp_find_ifn(void *ifn, uint32_t ifn_index); struct sctp_vrf *sctp_allocate_vrf(int vrfid); struct sctp_vrf *sctp_find_vrf(uint32_t vrfid); void sctp_free_vrf(struct sctp_vrf *vrf); /*- * Change address state, can be used if * O/S supports telling transports about * changes to IFA/IFN's (link layer triggers). * If a ifn goes down, we will do src-addr-selection * and NOT use that, as a source address. This does * not stop the routing system from routing out * that interface, but we won't put it as a source. 
*/ void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrfid, void *ifn, uint32_t ifn_index, uint32_t ifn_type, const char *if_name, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add); void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu); void sctp_free_ifn(struct sctp_ifn *sctp_ifnp); void sctp_free_ifa(struct sctp_ifa *sctp_ifap); void sctp_del_addr_from_vrf(uint32_t vrfid, struct sockaddr *addr, uint32_t ifn_index, const char *if_name); struct sctp_nets *sctp_findnet(struct sctp_tcb *, struct sockaddr *); struct sctp_inpcb *sctp_pcb_findep(struct sockaddr *, int, int, uint32_t); int sctp_inpcb_bind(struct socket *, struct sockaddr *, struct sctp_ifa *, struct thread *); int sctp_inpcb_bind_locked(struct sctp_inpcb *, struct sockaddr *, struct sctp_ifa *, struct thread *); struct sctp_tcb * sctp_findassociation_addr(struct mbuf *, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct sctp_chunkhdr *, struct sctp_inpcb **, struct sctp_nets **, uint32_t vrf_id); struct sctp_tcb * sctp_findassociation_addr_sa(struct sockaddr *, struct sockaddr *, struct sctp_inpcb **, struct sctp_nets **, int, uint32_t); void sctp_move_pcb_and_assoc(struct sctp_inpcb *, struct sctp_inpcb *, struct sctp_tcb *); /*- * For this call ep_addr, the to is the destination endpoint address of the * peer (relative to outbound). The from field is only used if the TCP model * is enabled and helps distinguish amongst the subset bound (non-boundall). * The TCP model MAY change the actual ep field, this is why it is passed. 
*/ struct sctp_tcb * sctp_findassociation_ep_addr(struct sctp_inpcb **, struct sockaddr *, struct sctp_nets **, struct sockaddr *, struct sctp_tcb *); struct sctp_tcb *sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock); struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *, sctp_assoc_t, int); struct sctp_tcb * sctp_findassociation_ep_asconf(struct mbuf *, int, struct sockaddr *, struct sctphdr *, struct sctp_inpcb **, struct sctp_nets **, uint32_t vrf_id); int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id); int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id); void sctp_inpcb_free(struct sctp_inpcb *, int, int); #define SCTP_DONT_INITIALIZE_AUTH_PARAMS 0 #define SCTP_INITIALIZE_AUTH_PARAMS 1 struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *, int *, uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *, int); struct sctp_tcb * sctp_aloc_assoc_connected(struct sctp_inpcb *, struct sockaddr *, int *, uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *, int); int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int); void sctp_add_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *, uint32_t); void sctp_del_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *); int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets **, uint16_t, int, int); void sctp_remove_net(struct sctp_tcb *, struct sctp_nets *); int sctp_del_remote_addr(struct sctp_tcb *, struct sockaddr *); void sctp_pcb_init(void); void sctp_pcb_finish(void); void sctp_add_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); void sctp_del_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); int sctp_load_addresses_from_init(struct sctp_tcb *, struct mbuf *, int, int, struct sockaddr *, struct sockaddr *, struct sockaddr *, uint16_t); int sctp_set_primary_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets *); bool 
sctp_is_vtag_good(uint32_t, uint16_t lport, uint16_t rport, struct timeval *); -/* void sctp_drain(void); */ - int sctp_destination_is_reachable(struct sctp_tcb *, struct sockaddr *); int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp); void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh); void sctp_pcb_add_flags(struct sctp_inpcb *, uint32_t); /*- * Null in last arg inpcb indicate run on ALL ep's. Specific inp in last arg * indicates run on ONLY assoc's of the specified endpoint. */ int sctp_initiate_iterator(inp_func inpf, asoc_func af, inp_func inpe, uint32_t, uint32_t, uint32_t, void *, uint32_t, end_func ef, struct sctp_inpcb *, uint8_t co_off); #if defined(SCTP_MCORE_INPUT) && defined(SMP) void sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use); #endif #endif /* _KERNEL */ #endif /* !__sctp_pcb_h__ */ diff --git a/sys/netinet/sctp_var.h b/sys/netinet/sctp_var.h index 16beaa7f8b12..3bff09adb367 100644 --- a/sys/netinet/sctp_var.h +++ b/sys/netinet/sctp_var.h @@ -1,349 +1,348 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_VAR_H_ #define _NETINET_SCTP_VAR_H_ #include #if defined(_KERNEL) || defined(__Userspace__) extern struct pr_usrreqs sctp_usrreqs; #define sctp_feature_on(inp, feature) (inp->sctp_features |= feature) #define sctp_feature_off(inp, feature) (inp->sctp_features &= ~feature) #define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature) #define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0) #define sctp_stcb_feature_on(inp, stcb, feature) {\ if (stcb) { \ stcb->asoc.sctp_features |= feature; \ } else if (inp) { \ inp->sctp_features |= feature; \ } \ } #define sctp_stcb_feature_off(inp, stcb, feature) {\ if (stcb) { \ stcb->asoc.sctp_features &= ~feature; \ } else if (inp) { \ inp->sctp_features &= ~feature; \ } \ } #define sctp_stcb_is_feature_on(inp, stcb, feature) \ (((stcb != NULL) && \ ((stcb->asoc.sctp_features & feature) == feature)) || \ ((stcb == NULL) && (inp != NULL) && \ ((inp->sctp_features & feature) == feature))) #define sctp_stcb_is_feature_off(inp, stcb, feature) \ (((stcb != NULL) && \ ((stcb->asoc.sctp_features & feature) == 0)) || \ ((stcb == NULL) && (inp != NULL) && \ 
((inp->sctp_features & feature) == 0)) || \ ((stcb == NULL) && (inp == NULL))) /* managing mobility_feature in inpcb (by micchie) */ #define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature) #define sctp_mobility_feature_off(inp, feature) (inp->sctp_mobility_features &= ~feature) #define sctp_is_mobility_feature_on(inp, feature) (inp->sctp_mobility_features & feature) #define sctp_is_mobility_feature_off(inp, feature) ((inp->sctp_mobility_features & feature) == 0) #define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND)) #define sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0)) #define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > SCTP_SBAVAIL(sb)) ? (sctp_maxspace(sb) - SCTP_SBAVAIL(sb)) : 0)) #define sctp_sbspace_sub(a,b) (((a) > (b)) ? ((a) - (b)) : 0) /* * I tried to cache the readq entries at one point. But the reality * is that it did not add any performance since this meant we had to * lock the STCB on read. And at that point once you have to do an * extra lock, it really does not matter if the lock is in the ZONE * stuff or in our code. 
Note that this same problem would occur with
  * an mbuf cache as well so it is not really worth doing, at least
  * right now :-D
  */
 
 /*
  * sctp_free_a_readq: hand a read-queue entry back to ipi_zone_readq and
  * decrement the readq count.  With INVARIANTS the macro first panics if
  * the entry is still marked as being on a stream queue.
  */
 #ifdef INVARIANTS
 #define sctp_free_a_readq(_stcb, _readq) { \
 	if ((_readq)->on_strm_q) \
 		panic("On strm q stcb:%p readq:%p", (_stcb), (_readq)); \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
 	SCTP_DECR_READQ_COUNT(); \
 }
 #else
 #define sctp_free_a_readq(_stcb, _readq) { \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
 	SCTP_DECR_READQ_COUNT(); \
 }
 #endif
 
 /*
  * sctp_alloc_a_readq: fetch a read-queue entry from ipi_zone_readq into
  * _readq; the readq count is only bumped when the fetch succeeded.
  */
 #define sctp_alloc_a_readq(_stcb, _readq) { \
 	(_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \
 	if ((_readq)) { \
 		SCTP_INCR_READQ_COUNT(); \
 	} \
 }
 
 /*
  * sctp_free_a_strmoq: release the auth key reference held by a pending
  * stream-queue entry (if any), hand the entry back to ipi_zone_strmoq and
  * decrement the strmoq count.
  *
  * The key release uses the macro's own arguments (_stcb and _strmoq),
  * matching sctp_free_a_chunk, instead of depending on the caller having
  * local variables that happen to be named "stcb" and "sp".
  */
 #define sctp_free_a_strmoq(_stcb, _strmoq, _so_locked) { \
 	if ((_strmoq)->holds_key_ref) { \
 		sctp_auth_key_release((_stcb), (_strmoq)->auth_keyid, _so_locked); \
 		(_strmoq)->holds_key_ref = 0; \
 	} \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), (_strmoq)); \
 	SCTP_DECR_STRMOQ_COUNT(); \
 }
 
 /*
  * sctp_alloc_a_strmoq: fetch a pending stream-queue entry from
  * ipi_zone_strmoq into _strmoq; on success the entry is zeroed and the
  * strmoq count is bumped.
  */
 #define sctp_alloc_a_strmoq(_stcb, _strmoq) { \
 	(_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \
 	if ((_strmoq)) { \
 		memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \
 		SCTP_INCR_STRMOQ_COUNT(); \
 		(_strmoq)->holds_key_ref = 0; \
 	} \
 }
 
 /*
  * sctp_free_a_chunk: release the chunk's auth key reference (if any) and,
  * when an association is given, drop its whoTo reference before deciding
  * between freeing the chunk and caching it on the association free list.
  */
 #define sctp_free_a_chunk(_stcb, _chk, _so_locked) { \
 	if ((_chk)->holds_key_ref) {\
 		sctp_auth_key_release((_stcb), (_chk)->auth_keyid, _so_locked); \
 		(_chk)->holds_key_ref = 0; \
 	} \
 	if (_stcb) { \
 		SCTP_TCB_LOCK_ASSERT((_stcb)); \
 		if ((_chk)->whoTo) { \
 			sctp_free_remote_addr((_chk)->whoTo); \
 			(_chk)->whoTo = NULL; \
 		} \
 		if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \
 		    (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \
 			SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
 			SCTP_DECR_CHK_COUNT(); \
 		} else { \
 			TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
 			(_stcb)->asoc.free_chunk_cnt++; \
atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ } \ } else { \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ SCTP_DECR_CHK_COUNT(); \ } \ } #define sctp_alloc_a_chunk(_stcb, _chk) { \ if (TAILQ_EMPTY(&(_stcb)->asoc.free_chunks)) { \ (_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \ if ((_chk)) { \ SCTP_INCR_CHK_COUNT(); \ (_chk)->whoTo = NULL; \ (_chk)->holds_key_ref = 0; \ } \ } else { \ (_chk) = TAILQ_FIRST(&(_stcb)->asoc.free_chunks); \ TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ (_chk)->holds_key_ref = 0; \ SCTP_STAT_INCR(sctps_cached_chk); \ (_stcb)->asoc.free_chunk_cnt--; \ } \ } #define sctp_free_remote_addr(__net) { \ if ((__net)) { \ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \ RO_NHFREE(&(__net)->ro); \ if ((__net)->src_addr_selected) { \ sctp_free_ifa((__net)->ro._s_addr); \ (__net)->ro._s_addr = NULL; \ } \ (__net)->src_addr_selected = 0; \ (__net)->dest_state &= ~SCTP_ADDR_REACHABLE; \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \ SCTP_DECR_RADDR_COUNT(); \ } \ } \ } #define sctp_sbfree(ctl, stcb, sb, m) { \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \ if (((ctl)->do_not_ref_stcb == 0) && stcb) {\ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ } \ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ SCTP_BUF_TYPE(m) != MT_OOBDATA) \ atomic_subtract_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ } #define sctp_sballoc(stcb, sb, m) { \ atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \ atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \ if (stcb) { \ atomic_add_int(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \ atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ } \ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER 
&& \ SCTP_BUF_TYPE(m) != MT_OOBDATA) \ atomic_add_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ } #define sctp_ucount_incr(val) { \ val++; \ } #define sctp_ucount_decr(val) { \ if (val > 0) { \ val--; \ } else { \ val = 0; \ } \ } #define sctp_mbuf_crush(data) do { \ struct mbuf *_m; \ _m = (data); \ while (_m && (SCTP_BUF_LEN(_m) == 0)) { \ (data) = SCTP_BUF_NEXT(_m); \ SCTP_BUF_NEXT(_m) = NULL; \ sctp_m_free(_m); \ _m = (data); \ } \ } while (0) #define sctp_flight_size_decrease(tp1) do { \ if (tp1->whoTo->flight_size >= tp1->book_size) \ tp1->whoTo->flight_size -= tp1->book_size; \ else \ tp1->whoTo->flight_size = 0; \ } while (0) #define sctp_flight_size_increase(tp1) do { \ (tp1)->whoTo->flight_size += (tp1)->book_size; \ } while (0) #ifdef SCTP_FS_SPEC_LOG #define sctp_total_flight_decrease(stcb, tp1) do { \ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.tsn; \ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \ stcb->asoc.fs_index++; \ tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ stcb->asoc.total_flight_count--; \ } else { \ stcb->asoc.total_flight = 0; \ stcb->asoc.total_flight_count = 0; \ } \ } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.tsn; \ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ 
stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \ stcb->asoc.fs_index++; \ (stcb)->asoc.total_flight_count++; \ (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #else #define sctp_total_flight_decrease(stcb, tp1) do { \ tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ stcb->asoc.total_flight_count--; \ } else { \ stcb->asoc.total_flight = 0; \ stcb->asoc.total_flight_count = 0; \ } \ } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ (stcb)->asoc.total_flight_count++; \ (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #endif #define SCTP_PF_ENABLED(_net) (_net->pf_threshold < _net->failure_threshold) #define SCTP_NET_IS_PF(_net) (_net->pf_threshold < _net->error_count) struct sctp_nets; struct sctp_inpcb; struct sctp_tcb; struct sctphdr; void sctp_close(struct socket *so); int sctp_disconnect(struct socket *so); void sctp_ctlinput(int, struct sockaddr *, void *); int sctp_ctloutput(struct socket *, struct sockopt *); void sctp_input_with_port(struct mbuf *, int, uint16_t); int sctp_input(struct mbuf **, int *, int); void sctp_pathmtu_adjustment(struct sctp_tcb *, uint32_t, bool); -void sctp_drain(void); void sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, uint8_t, uint8_t, uint16_t, uint32_t); int sctp_flush(struct socket *, int); int sctp_shutdown(struct socket *); int sctp_bindx(struct socket *, int, struct sockaddr_storage *, int, int, struct proc *); /* can't use sctp_assoc_t here */ int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *); int sctp_ingetaddr(struct socket *, struct sockaddr **); int sctp_peeraddr(struct socket *, struct sockaddr **); int sctp_listen(struct socket *, int, struct thread *); int sctp_accept(struct socket *, struct sockaddr **); #endif /* _KERNEL */ #endif /* !_NETINET_SCTP_VAR_H_ */ diff --git 
a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 70d1d2fb942a..e26fe0ec247e 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4149 +1,4156 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif #ifdef NETFLIX_EXP_DETECTION /* Sack attack detection thresholds and such */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sack Attack detection thresholds"); int32_t tcp_force_detection = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, &tcp_sack_to_ack_thresh, 700, "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, CTLFLAG_RW, &tcp_sack_to_move_thresh, 600, "Percentage of sack moves we must see above (10.1 percent is 101)"); int32_t 
tcp_restoral_thresh = 650; /* 65 % (sack:2:ack -5%) */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, CTLFLAG_RW, &tcp_restoral_thresh, 550, "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); int32_t tcp_sad_decay_val = 800; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, CTLFLAG_RW, &tcp_sad_decay_val, 800, "The decay percentage (10.1 percent equals 101 )"); int32_t tcp_map_minimum = 500; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, CTLFLAG_RW, &tcp_map_minimum, 500, "Number of Map enteries before we start detection"); int32_t tcp_attack_on_turns_on_logging = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged, CTLFLAG_RW, &tcp_attack_on_turns_on_logging, 0, "When we have a positive hit on attack, do we turn on logging?"); int32_t tcp_sad_pacing_interval = 2000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, CTLFLAG_RW, &tcp_sad_pacing_interval, 2000, "What is the minimum pacing interval for a classified attacker?"); int32_t tcp_sad_low_pps = 100; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, CTLFLAG_RW, &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif uint32_t tcp_ack_war_time_window = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_RW, &tcp_ack_war_time_window, 1000, "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); uint32_t tcp_ack_war_cnt = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_RW, &tcp_ack_war_cnt, 5, "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, 
TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... 
*/ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, 
isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr); static struct 
tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct 
tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } /* Find a matching alias for the given tcp_function_block. */ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); /* * Release the old stack. This function will either find a new one * or panic. */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. */ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? 
*/ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init == NULL || (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) return; /* * Initialization failed. Release the reference count on * the stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. */ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if (tfb->tfb_tcp_handoff_ok != NULL) { if ((*tfb->tfb_tcp_handoff_ok) (tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } } tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init != NULL && (*tp->t_fb->tfb_tcp_fb_init)(tp)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); 
uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). */ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct 
tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', alias ? f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static void tcp_over_udp_stop(void) { /* * This function assumes sysctl caller holds inp_rinfo_lock() * for writing! 
*/ #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* * This function assumes sysctl caller holds inp_info_rlock() * for writing! */ port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. 
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); } } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. 
*/ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. */ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. 
*
 * The init() function from the default can *never* return non-zero i.e.
 * it is required to always succeed since it is the stack of last resort!
 */
static int
tcp_default_fb_init(struct tcpcb *tp)
{
	struct socket *so;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
	    ("%s: connection %p in unexpected state %d", __func__, tp,
	    tp->t_state));

	/*
	 * Nothing to do for states up to and including LISTEN.  And, we
	 * don't know what to do for unexpected states (which includes
	 * TIME_WAIT).  Established connections do NOT return here; they
	 * get their timers checked below.
	 */
	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
		return (0);

	/*
	 * Make sure some kind of transmission timer is set if there is
	 * outstanding data.
	 */
	so = tp->t_inpcb->inp_socket;
	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
	    tcp_timer_active(tp, TT_PERSIST))) {
		/*
		 * If the session has established and it looks like it should
		 * be in the persist state, set the persist timer. Otherwise,
		 * set the retransmit timer.
		 */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
		    (int32_t)(tp->snd_nxt - tp->snd_una) <
		    (int32_t)sbavail(&so->so_snd))
			tcp_setpersist(tp);
		else
			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
	}

	/*
	 * Every session that gets this far has a keepalive timer running:
	 * the idle interval once established, the init interval before.
	 */
	if (!tcp_timer_active(tp, TT_KEEP))
		tcp_timer_activate(tp, TT_KEEP,
		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
		    TP_KEEPINIT(tp));

	/*
	 * Make sure critical variables are initialized
	 * if transitioning while in Recovery.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs = max(1,
			    tp->snd_nxt - tp->snd_una);
	}

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tp->t_inpcb); return; } /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; #ifdef TCP_HHOOK struct osd osd; #endif }; VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. 
If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. 
*/ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { *num_names = 0; return (EINVAL); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; return (EINVAL); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. */ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. 
*/ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed, or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. 
*/ if (force && blk->tfb_refcnt) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_flags & INP_TIMEWAIT) continue; tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. */ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); /* * These have to be type stable for the benefit of the timers. 
*/
	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* The tcpcb zone is capped at maxsockets. */
	uma_zone_set_max(V_tcpcb_zone, maxsockets);
	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");

	tcp_tw_init();
	syncache_init();
	tcp_hc_init();

	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	tcp_fastopen_init();

	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);

	V_tcp_msl = TCPTV_MSL;
}
VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    tcp_vnet_init, NULL);

+static void tcp_drain(void);
+
/*
 * Global TCP initialisation, registered through SYSINIT below.
 *
 * Seeds the timer defaults from the TCPTV_* constants, sets up the
 * function-block list and registers the default block, checks the
 * protocol header size limits, registers event handlers, allocates the
 * LRO counters, computes tcp_tcbhashsize and finally registers the TCP
 * input/ctlinput handlers for the compiled-in address families.
 */
static void
tcp_init(void *arg __unused)
{
	const char *tcbhash_tuneable;
	int hashsize;

	tcp_reass_global_init();

	/* XXX virtualize those below? */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	/* The retransmit timer values are never allowed to drop below 1. */
	tcp_rexmit_initial = TCPTV_RTOBASE;
	if (tcp_rexmit_initial < 1)
		tcp_rexmit_initial = 1;
	tcp_rexmit_min = TCPTV_MIN;
	if (tcp_rexmit_min < 1)
		tcp_rexmit_min = 1;
	tcp_persmin = TCPTV_PERSMIN;
	tcp_persmax = TCPTV_PERSMAX;
	tcp_rexmit_slop = TCPTV_CPU_VAR;
	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;

	/* Setup the tcp function block list */
	TAILQ_INIT(&t_functions);
	rw_init(&tcp_function_lock, "tcp_func_lock");
	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
#ifdef TCP_BLACKBOX
	/* Initialize the TCP logging data. */
	tcp_log_init();
#endif
	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);

	/* Optionally switch the receive path to soreceive_stream. */
	if (tcp_soreceive_stream) {
#ifdef INET
		tcp_usrreqs.pru_soreceive = soreceive_stream;
#endif
#ifdef INET6
		tcp6_usrreqs.pru_soreceive = soreceive_stream;
#endif /* INET6 */
	}

#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	/* Link + protocol headers must fit into a single header mbuf. */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR

	ISN_LOCK_INIT();
	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
	    SHUTDOWN_PRI_DEFAULT);
+	EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
+	EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);

	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
	tcp_comp_total = counter_u64_alloc(M_WAITOK);
	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
	tcp_bad_csums = counter_u64_alloc(M_WAITOK);
#ifdef TCPPCAP
	tcp_pcap_init();
#endif

	/* A compile-time or tunable value of 0 requests auto-tuning. */
	hashsize = TCBHASHSIZE;
	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
	if (hashsize == 0) {
		/*
		 * Auto tune the hash size based on maxsockets.
		 * A perfect hash would have a 1:1 mapping
		 * (hashsize = maxsockets) however it's been
		 * suggested that O(2) average is better.
		 */
		hashsize = maketcp_hashsize(maxsockets / 4);
		/*
		 * Our historical default is 512,
		 * do not autotune lower than this.
		 */
		if (hashsize < 512)
			hashsize = 512;
		if (bootverbose)
			printf("%s: %s auto tuned to %d\n", __func__,
			    tcbhash_tuneable, hashsize);
	}
	/*
	 * We require a hashsize to be a power of two.
	 * Previously if it was not a power of two we would just reset it
	 * back to 512, which could be a nasty surprise if you did not notice
	 * the error message.
	 * Instead the value is passed through maketcp_hashsize(), which
	 * yields the next power of two above the specified value (it only
	 * goes one power lower when the higher one would overflow), and a
	 * floor of 16 is applied.
	 */
	if (!powerof2(hashsize)) {
		int oldhashsize = hashsize;

		hashsize = maketcp_hashsize(hashsize);
		/* prevent absurdly low value */
		if (hashsize < 16)
			hashsize = 16;
		printf("%s: WARNING: TCB hash size not a power of 2, "
		    "clipped from %d to %d.\n", __func__, oldhashsize,
		    hashsize);
	}
	tcp_tcbhashsize = hashsize;

#ifdef INET
	IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput);
#endif
#ifdef INET6
	IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput);
#endif
}
SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);

#ifdef VIMAGE
/*
 * Tear down the TCP state referenced through the V_ variables below;
 * only compiled with VIMAGE.
 */
static void
tcp_destroy(void *unused __unused)
{
	int n;
#ifdef TCP_HHOOK
	int error;
#endif

	/*
	 * All our processes are gone, all our sockets should be cleaned
	 * up, which means, we should be past the tcp_discardcb() calls.
	 * Sleep to let all tcpcb timers really disappear and cleanup.
	 */
	for (;;) {
		INP_INFO_WLOCK(&V_tcbinfo);
		n = V_tcbinfo.ipi_count;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		if (n == 0)
			break;
		/* Poll again after hz / 10 ticks. */
		pause("tcpdes", hz / 10);
	}
	tcp_hc_destroy();
	syncache_destroy();
	tcp_tw_destroy();
	in_pcbinfo_destroy(&V_tcbinfo);
	/* tcp_discardcb() clears the sack_holes up. */
	uma_zdestroy(V_sack_hole_zone);
	uma_zdestroy(V_tcpcb_zone);

	/*
	 * Cannot free the zone until all tcpcbs are released as we attach
	 * the allocations to them.
*/
	tcp_fastopen_destroy();

	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
	VNET_PCPUSTAT_FREE(tcpstat);

#ifdef TCP_HHOOK
	/* A failed hook deregistration is only reported, not acted upon. */
	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
	if (error != 0) {
		printf("%s: WARNING: unable to deregister helper hook "
		    "type=%d, id=%d: error %d returned\n", __func__,
		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
	}
	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
	if (error != 0) {
		printf("%s: WARNING: unable to deregister helper hook "
		    "type=%d, id=%d: error %d returned\n", __func__,
		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
	}
#endif
}
VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
#endif

/*
 * Empty function; the argument is ignored and nothing is done.
 */
void
tcp_fini(void *xtp)
{

}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 *
 * inp     - connection whose addresses, ports, TOS and TTL are copied;
 *           its write lock must be held (asserted).
 * port    - zero selects IPPROTO_TCP as the next protocol, any other
 *           value selects IPPROTO_UDP; the value itself is not stored.
 * ip_ptr  - buffer that receives the IPv6 or IPv4 header, chosen by
 *           INP_IPV6 in inp->inp_vflag.
 * tcp_ptr - buffer that receives the TCP header.
 */
void
tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
{
	struct tcphdr *th = (struct tcphdr *)tcp_ptr;

	INP_WLOCK_ASSERT(inp);

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		/* Replace only the flow-info and version bits. */
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
			(inp->inp_flow & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
			(IPV6_VERSION & IPV6_VERSION_MASK);
		if (port == 0)
			ip6->ip6_nxt = IPPROTO_TCP;
		else
			ip6->ip6_nxt = IPPROTO_UDP;
		ip6->ip6_plen = htons(sizeof(struct tcphdr));
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
	}
#endif /* INET6 */
	/*
	 * The "else" below is only compiled when both families are present;
	 * it attaches the INET block that follows to the INET6 test above.
	 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		struct ip *ip;

		ip = (struct ip *)ip_ptr;
		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_sum = 0;
		if (port == 0)
			ip->ip_p = IPPROTO_TCP;
		else
			ip->ip_p = IPPROTO_UDP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
	}
#endif /* INET */
	/* The TCP header gets the connection's ports; the rest is zeroed. */
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_off = 5;
	tcp_set_flags(th, 0);
	th->th_win = 0;
	th->th_urp = 0;
	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates a struct tcptemp with malloc(M_TEMP, M_NOWAIT) (not an mbuf)
 * and fills in a skeletal tcp/ip header via tcpip_fillheaders() with a
 * port of 0.  The only use for this function is in keepalives, which use
 * tcp_respond.
 *
 * Returns the new template, or NULL if the allocation failed.
 */
struct tcptemp *
tcpip_maketemplate(struct inpcb *inp)
{
	struct tcptemp *t;

	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
	if (t == NULL)
		return (NULL);
	tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
	return (t);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == NULL, then we make a copy
 * of the tcpiphdr at th and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment th,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then th must point to *inside* the mbuf.
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); 
uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. 
*/ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. 
*/ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(tp != NULL ? 
tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts; log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. 
* * The following code ensures that the above description * is and stays correct. */ KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && (tp->t_state == TCPS_CLOSED || (tp->t_state > TCPS_LISTEN && tp->t_port != port)), ("%s: Logging of TCP segment with flags 0x%b and " "UDP encapsulation port %u skipped in state %s", __func__, thflags, PRINT_TH_FLAGS, ntohs(port), tcpstates[tp->t_state])); } } #ifdef INET6 if (isipv6) { TCP_PROBE5(send, NULL, tp, ip6, tp, nth); output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { TCP_PROBE5(send, NULL, tp, ip, tp, nth); output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); } #endif if (lgb != NULL) lgb->tlb_errno = output_ret; } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ cc_attach(tp, CC_DEFAULT_ALGO()); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. 
*/ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) { cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); in_pcbrele_wlocked(inp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef TCP_HHOOK tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); in_pcbrele_wlocked(inp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #endif #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; TAILQ_INIT(&tp->t_segq); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. 
*/ tcp_pcap_tcpcb_init(tp); #endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. */ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) { refcount_release(&tp->t_fb->tfb_refcnt); in_pcbrele_wlocked(inp); uma_zfree(V_tcpcb_zone, tm); return (NULL); } } #ifdef STATS if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); /* Don't use tcp_output() here due to possible recursion. */ (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * If stopping a timer fails, we schedule a discard function in same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tp->t_timers->tt_draincnt = 0; tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* * Call the stop-all function of the methods, * this function should call the tcp_timer_stop() * method with each of the function specific timeouts. * That stop will be called via the tfb_tcp_timer_stop() * which should use the async drain function of the * callout system (see tcp_var.h). 
*/ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); CC_DATA(tp) = NULL; /* Detach from the CC algorithm */ cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); #endif #ifdef STATS stats_blob_destroy(tp->t_stats); #endif CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if (tp->t_timers->tt_draincnt == 0) { bool released __diagused; released = tcp_freecb(tp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } bool tcp_freecb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(inp); MPASS(tp->t_timers->tt_draincnt == 0); /* We own the last reference on tcpcb, let's free it. */ #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! * * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. 
* Lets also at least protect against the so being NULL * as RW stated below. */ if ((tp->t_rttupdated >= 4) && (so != NULL)) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tp); return (in_pcbrele_wlocked(inp)); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. 
*/ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #ifdef TCPHPTS tcp_hpts_remove(inp); #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); sorele(so); return (NULL); } return (tp); } -void +static void tcp_drain(void) { + struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; + NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if (inpb->inp_flags & INP_TIMEWAIT) continue; if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). 
*/ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen) { int crerr; /* * XXX: This use of cr_cansee(), introduced 
with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) crerr = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else crerr = EINVAL; /* Skip this inp. */ } else crerr = cr_canseeinpcb(req->td->td_ucred, inp); if (crerr == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { INP_RUNLOCK(inp); break; } else continue; } } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, 
tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. */ static inline int tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) { int mtu = ntohs(icp->icmp_nextmtu); /* If no alternative MTU was proposed, try the next smaller one. 
*/ if (!mtu) mtu = ip_next_mtu(ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); return (mtu); } static void tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } icmp_tcp_seq = th->th_seq; if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) { /* * MTU discovery for offloaded connections. Let * the TOE driver verify seq# and process it. 
*/ mtu = tcp_next_pmtu(icp, ip); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) { goto out; } if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: we got a needfrag and * will potentially try a lower MTU. */ mtu = tcp_next_pmtu(icp, ip); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); inp = tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { tcp_ctlinput_with_port(cmd, sa, vip, htons(0)); } void tcp_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *vip, void *unused) { /* Its a tunneled TCP over UDP icmp */ struct ip *outer_ip, *inner_ip; struct icmp *icmp; struct udphdr *udp; struct tcphdr *th, ttemp; int i_hlen, o_len; uint16_t port; inner_ip = (struct ip *)vip; icmp = (struct icmp *)((caddr_t)inner_ip - (sizeof(struct icmp) - sizeof(struct ip))); outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); i_hlen = inner_ip->ip_hl << 2; o_len = ntohs(outer_ip->ip_len); if (o_len < (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { /* Not enough data present */ return; } /* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */ udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; th = (struct tcphdr *)(udp + 1); memcpy(&ttemp, th, sizeof(struct tcphdr)); 
memcpy(udp, &ttemp, sizeof(struct tcphdr)); /* Now adjust down the size of the outer IP header */ o_len -= sizeof(struct udphdr); outer_ip->ip_len = htons(o_len); /* Now call in to the normal handling code */ tcp_ctlinput_with_port(cmd, sa, vip, port); } #endif /* INET */ #ifdef INET6 static inline int tcp6_next_pmtu(const struct icmp6_hdr *icmp6) { int mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was proposed, or the proposed MTU was too * small, set to the min. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? */ return (mtu); } static void tcp6_ctlinput_with_port(int cmd, struct sockaddr *sa, void *d, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; dst = ip6cp->ip6c_finaldst; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; dst = NULL; } if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL) notify = tcp_drop_syn_sent; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. 
*/ else if (cmd == PRC_HOSTDEAD) ip6 = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) return; if (ip6 == NULL) { in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); return; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ inp = (*notify)(inp, EHOSTDOWN); goto out; } off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) { goto out; } if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. 
*/ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, inet6ctlerrmap[cmd]); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { tcp6_ctlinput_with_port(cmd, sa, d, htons(0)); } void tcp6_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *d, void *unused) { struct ip6ctlparam *ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; ip6cp = (struct ip6ctlparam *)d; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now call in to the normal handling code */ tcp6_ctlinput_with_port(cmd, sa, d, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, 
sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) 
* * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. 
This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. 
This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. 
*/ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* * In case of the IPV6_USE_MIN_MTU socket * option, the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. */ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) tp->t_maxseg = TCP6_MSS; } } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. 
*/ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. 
*/
#define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (tp->t_flags & TF_RCVD_TSTMP)
			optlen = TCPOLEN_TSTAMP_APPA;
		else
			optlen = 0;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
		if (tp->t_flags & TF_SIGNATURE)
			optlen += PAD(TCPOLEN_SIGNATURE);
#endif
	} else {
		if (tp->t_flags & TF_REQ_TSTMP)
			optlen = TCPOLEN_TSTAMP_APPA;
		else
			optlen = PAD(TCPOLEN_MAXSEG);
		if (tp->t_flags & TF_REQ_SCALE)
			optlen += PAD(TCPOLEN_WINDOW);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
		if (tp->t_flags & TF_SIGNATURE)
			optlen += PAD(TCPOLEN_SIGNATURE);
#endif
		if (tp->t_flags & TF_SACK_PERMIT)
			optlen += PAD(TCPOLEN_SACK_PERMITTED);
	}
#undef PAD
	optlen = min(optlen, TCP_MAXOLEN);
	return (tp->t_maxseg - optlen);
}

/*
 * Sysctl handler that drops one TCP connection.
 *
 * The request is write-only: it must supply (as new data) two
 * struct sockaddr_storage, the foreign address first and the local
 * address second.  Both must be of the same family (AF_INET or AF_INET6)
 * and carry the exact sin_len/sin6_len of that family.
 *
 * Returns:
 *	EINVAL	old data was requested, or the addresses are malformed or
 *		of an unsupported family
 *	EPERM	no new data was supplied
 *	ENOMEM	the new data is shorter than the two addresses
 *	ESRCH	no matching inpcb was found
 *	0	an inpcb was found (whether or not it could be dropped),
 *	or the error returned by SYSCTL_IN() / sa6_embedscope().
 */
static int
sysctl_drop(SYSCTL_HANDLER_ARGS)
{
	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
	struct sockaddr_storage addrs[2];
	struct inpcb *inp;
	struct tcpcb *tp;
	struct tcptw *tw;
#ifdef INET
	struct sockaddr_in *fin = NULL, *lin = NULL;
#endif
	struct epoch_tracker et;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
#endif
	int error;

	inp = NULL;
#ifdef INET6
	fin6 = lin6 = NULL;
#endif
	error = 0;

	/* Reject reads; only a write carrying both addresses is accepted. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen < sizeof(addrs))
		return (ENOMEM);
	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
	if (error)
		return (error);

	/* First pass: validate the addresses and set up the typed views. */
	switch (addrs[0].ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&addrs[0];
		lin6 = (struct sockaddr_in6 *)&addrs[1];
		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
		    lin6->sin6_len != sizeof(struct sockaddr_in6))
			return (EINVAL);
		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
			/* A v4-mapped foreign address needs a v4-mapped peer. */
			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
				return (EINVAL);
			/*
			 * Convert both addresses in place.
			 * NOTE(review): presumably this rewrites ss_family to
			 * AF_INET so the lookup switch below takes the IPv4
			 * case -- confirm against in6_sin6_2_sin_in_sock().
			 */
			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
#ifdef INET
			fin = (struct sockaddr_in *)&addrs[0];
			lin = (struct sockaddr_in *)&addrs[1];
#endif
			break;
		}
		error = sa6_embedscope(fin6, V_ip6_use_defzone);
		if (error)
			return (error);
		error = sa6_embedscope(lin6, V_ip6_use_defzone);
		if (error)
			return (error);
		break;
#endif
#ifdef INET
	case AF_INET:
		fin = (struct sockaddr_in *)&addrs[0];
		lin = (struct sockaddr_in *)&addrs[1];
		if (fin->sin_len != sizeof(struct sockaddr_in) ||
		    lin->sin_len != sizeof(struct sockaddr_in))
			return (EINVAL);
		break;
#endif
	default:
		return (EINVAL);
	}

	/* Second pass: look the connection up; the inpcb comes back write-locked. */
	NET_EPOCH_ENTER(et);
	switch (addrs[0].ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
		    INPLOOKUP_WLOCKPCB, NULL);
		break;
#endif
#ifdef INET
	case AF_INET:
		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
		break;
#endif
	}
	if (inp != NULL) {
		if (inp->inp_flags & INP_TIMEWAIT) {
			/*
			 * XXXRW: There currently exists a state where an
			 * inpcb is present, but its timewait state has been
			 * discarded.  For now, don't allow dropping of this
			 * type of inpcb.
			 */
			tw = intotw(inp);
			/*
			 * No explicit unlock after tcp_twclose(); presumably
			 * it releases the inpcb lock itself -- TODO confirm.
			 */
			if (tw != NULL)
				tcp_twclose(tw, 0);
			else
				INP_WUNLOCK(inp);
		} else if ((inp->inp_flags & INP_DROPPED) == 0 &&
		    !SOLISTENING(inp->inp_socket)) {
			tp = intotcpcb(inp);
			tp = tcp_drop(tp, ECONNABORTED);
			/* Unlock only if tcp_drop() handed the tcpcb back. */
			if (tp != NULL)
				INP_WUNLOCK(inp);
		} else
			INP_WUNLOCK(inp);
	} else
		error = ESRCH;
	NET_EPOCH_EXIT(et);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
    CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "",
    "Drop TCP connection");

/*
 * Sysctl handler that forwards to sysctl_setsockopt() for the TCP pcb
 * database, using tcp_ctloutput_set as the option setter.
 */
static int
tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo,
	    &tcp_ctloutput_set));
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt,
    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
    CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "",
    "Set socket option for TCP endpoint");

#ifdef KERN_TLS
/*
 * Sysctl handler that switches the transmit TLS mode of one TCP
 * connection.  The connection is named exactly as for sysctl_drop():
 * two struct sockaddr_storage (foreign, then local) supplied as new data.
 * arg2 selects the mode: 0 requests TCP_TLS_MODE_SW, anything else
 * requests TCP_TLS_MODE_IFNET.
 *
 * Returns EINVAL/EPERM/ENOMEM for a malformed request, ESRCH if no inpcb
 * matches, ECONNRESET if the inpcb is in timewait, dropped or has no
 * socket, and otherwise the result of ktls_set_tx_mode().
 */
static int
sysctl_switch_tls(SYSCTL_HANDLER_ARGS)
{
	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
	struct sockaddr_storage addrs[2];
	struct inpcb *inp;
#ifdef INET
	struct sockaddr_in *fin = NULL, *lin = NULL;
#endif
	struct epoch_tracker et;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
#endif
	int error;

	inp = NULL;
#ifdef INET6
	fin6 = lin6 = NULL;
#endif
	error = 0;

	/* Reject reads; only a write carrying both addresses is accepted. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen < sizeof(addrs))
		return (ENOMEM);
	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
	if (error)
		return (error);

	/* Validate the addresses; same rules as in sysctl_drop(). */
	switch (addrs[0].ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&addrs[0];
		lin6 = (struct sockaddr_in6 *)&addrs[1];
		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
		    lin6->sin6_len != sizeof(struct sockaddr_in6))
			return (EINVAL);
		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
				return (EINVAL);
			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
#ifdef INET
			fin = (struct sockaddr_in *)&addrs[0];
			lin = (struct sockaddr_in *)&addrs[1];
#endif
			break;
		}
		error = sa6_embedscope(fin6, V_ip6_use_defzone);
		if (error)
			return (error);
		error = sa6_embedscope(lin6, V_ip6_use_defzone);
		if (error)
			return (error);
		break;
#endif
#ifdef INET
	case AF_INET:
		fin = (struct sockaddr_in *)&addrs[0];
		lin = (struct sockaddr_in *)&addrs[1];
		if (fin->sin_len != sizeof(struct sockaddr_in) ||
		    lin->sin_len != sizeof(struct sockaddr_in))
			return (EINVAL);
		break;
#endif
	default:
		return (EINVAL);
	}

	/*
	 * Only the lookup runs inside the network epoch; the inpcb is
	 * returned write-locked and is used after the epoch is exited.
	 */
	NET_EPOCH_ENTER(et);
	switch (addrs[0].ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
		    INPLOOKUP_WLOCKPCB, NULL);
		break;
#endif
#ifdef INET
	case AF_INET:
		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
		break;
#endif
	}
	NET_EPOCH_EXIT(et);
	if (inp != NULL) {
		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
		    inp->inp_socket == NULL) {
			error = ECONNRESET;
			INP_WUNLOCK(inp);
		} else {
			struct socket *so;

			/*
			 * Take a socket reference before the mode switch and
			 * release it only after the inpcb lock is dropped.
			 */
			so = inp->inp_socket;
			soref(so);
			error = ktls_set_tx_mode(so,
			    arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET);
			INP_WUNLOCK(inp);
			sorele(so);
		}
	} else
		error = ESRCH;
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls,
    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
    CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "",
    "Switch TCP connection to SW TLS");
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls,
    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
    CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "",
    "Switch TCP connection to ifnet TLS");
#endif

/*
 * Generate a standardized TCP log line for use throughout the
 * tcp subsystem.  Memory allocation is done with M_NOWAIT to
 * allow use in the interrupt context.
 *
 * NB: The caller MUST free(s, M_TCPLOG) the returned string.
 * NB: The function may return NULL if memory allocation failed.
*
 * Due to header inclusion and ordering limitations the struct ip
 * and ip6_hdr pointers have to be passed as void pointers.
 */
/*
 * Wrapper around tcp_log_addr() that returns NULL without allocating
 * when V_tcp_log_in_vain is zero.
 */
char *
tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
    const void *ip6hdr)
{

	/* Is logging enabled? */
	if (V_tcp_log_in_vain == 0)
		return (NULL);

	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
}

/*
 * Wrapper around tcp_log_addr() that returns NULL without allocating
 * when tcp_log_debug is zero.
 */
char *
tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
    const void *ip6hdr)
{

	/* Is logging enabled? */
	if (tcp_log_debug == 0)
		return (NULL);

	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
}

/*
 * Build the log line itself.
 *
 * The addresses and ports are taken from `inc' when it is non-NULL;
 * otherwise from the IPv6 header plus `th', or the IPv4 header plus `th'.
 * If none of those combinations is available the buffer is freed and NULL
 * is returned.  When `th' is non-NULL the TCP flags are appended.
 *
 * Returns a zero-initialised M_TCPLOG allocation holding the string, or
 * NULL on allocation failure or missing input.  Panics if the last byte
 * of the buffer is no longer NUL after formatting.
 */
static char *
tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
    const void *ip6hdr)
{
	char *s, *sp;
	size_t size;
#ifdef INET
	const struct ip *ip = (const struct ip *)ip4hdr;
#endif
#ifdef INET6
	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr;
#endif /* INET6 */

	/*
	 * The log line looks like this:
	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2"
	 */
	/* Size the buffer for the fixed text, the flag names and two addresses. */
	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
	    sizeof(PRINT_TH_FLAGS) + 1 +
#ifdef INET6
	    2 * INET6_ADDRSTRLEN;
#else
	    2 * INET_ADDRSTRLEN;
#endif /* INET6 */

	/* M_ZERO: the buffer starts as an empty string, so strcat() is valid. */
	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
	if (s == NULL)
		return (NULL);

	strcat(s, "TCP: [");
	sp = s + strlen(s);

	/* sp is re-pointed at the terminating NUL after every append. */
	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
		inet_ntoa_r(inc->inc_faddr, sp);
		sp = s + strlen(s);
		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
		sp = s + strlen(s);
		inet_ntoa_r(inc->inc_laddr, sp);
		sp = s + strlen(s);
		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
#ifdef INET6
	} else if (inc) {
		ip6_sprintf(sp, &inc->inc6_faddr);
		sp = s + strlen(s);
		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
		sp = s + strlen(s);
		ip6_sprintf(sp, &inc->inc6_laddr);
		sp = s + strlen(s);
		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
	} else if (ip6 && th) {
		ip6_sprintf(sp, &ip6->ip6_src);
		sp = s + strlen(s);
		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
		sp = s + strlen(s);
		ip6_sprintf(sp, &ip6->ip6_dst);
		sp = s + strlen(s);
		sprintf(sp, "]:%i", ntohs(th->th_dport));
#endif /* INET6 */
#ifdef INET
	} else if (ip && th) {
		inet_ntoa_r(ip->ip_src, sp);
		sp = s + strlen(s);
		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
		sp = s + strlen(s);
		inet_ntoa_r(ip->ip_dst, sp);
		sp = s + strlen(s);
		sprintf(sp, "]:%i", ntohs(th->th_dport));
#endif /* INET */
	} else {
		/* Nothing to derive the endpoints from. */
		free(s, M_TCPLOG);
		return (NULL);
	}
	sp = s + strlen(s);
	if (th)
		sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS);
	/*
	 * The writes above are unbounded; a non-NUL final byte means the
	 * size estimate was exceeded.
	 */
	if (*(s + size - 1) != '\0')
		panic("%s: string too long", __func__);
	return (s);
}

/*
 * A subroutine which makes it easy to track TCP state changes with DTrace.
 * This function shouldn't be called for t_state initializations that don't
 * correspond to actual TCP state transitions.
 */
void
tcp_state_change(struct tcpcb *tp, int newstate)
{
#if defined(KDTRACE_HOOKS)
	/* Remember the old state; it is passed to the probe below. */
	int pstate = tp->t_state;
#endif

	/* Move the per-state connection count from the old to the new state. */
	TCPSTATES_DEC(tp->t_state);
	TCPSTATES_INC(newstate);
	tp->t_state = newstate;
	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
}

/*
 * Create an external-format (``xtcpcb'') structure using the information in
 * the kernel-format tcpcb structure pointed to by tp.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
*/ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); struct tcptw *tw = intotw(inp); sbintime_t now; bzero(xt, sizeof(*xt)); if (inp->inp_flags & INP_TIMEWAIT) { xt->t_state = TCPS_TIME_WAIT; xt->xt_encaps_port = tw->t_port; } else { xt->t_state = tp->t_state; xt->t_logstate = tp->t_logstate; xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_dsack_bytes = tp->t_dsack_bytes; xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; xt->t_dsack_pack = tp->t_dsack_pack; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; now = getsbinuptime(); #define COPYTIMER(ttt) do { \ if (callout_active(&tp->t_timers->ttt)) \ xt->ttt = (tp->t_timers->ttt.c_time - now) / \ SBT_1MS; \ else \ xt->ttt = 0; \ } while (0) COPYTIMER(tt_delack); COPYTIMER(tt_rexmt); COPYTIMER(tt_persist); COPYTIMER(tt_keep); COPYTIMER(tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif } xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); if (inp->inp_socket == NULL) xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? 
*/ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { return (0); } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 0; } else if (tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index fa86ab51d68b..62d64f2dbdb2 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1305 +1,1304 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include "opt_kern_tls.h" #include #include #include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). 
*/
/************************************************/
/* One bit per TCP_EI_STATUS_* code: bit value is 1 << (status - 1). */
#define	TCP_EI_BITS_CLIENT_FIN	0x001
#define	TCP_EI_BITS_CLIENT_RST	0x002
#define	TCP_EI_BITS_SERVER_FIN	0x004
#define	TCP_EI_BITS_SERVER_RST	0x008
#define	TCP_EI_BITS_RETRAN	0x010
#define	TCP_EI_BITS_PROGRESS	0x020
#define	TCP_EI_BITS_PRESIST_MAX	0x040
#define	TCP_EI_BITS_KEEP_MAX	0x080
#define	TCP_EI_BITS_DATA_A_CLO	0x100
#define	TCP_EI_BITS_RST_IN_FR	0x200	/* a front state reset */
#define	TCP_EI_BITS_2MS_TIMER	0x400	/* 2 MSL timer expired */

#if defined(_KERNEL) || defined(_WANT_TCPCB)
/* TCP segment queue entry */
struct tseg_qent {
	TAILQ_ENTRY(tseg_qent) tqe_q;
	struct	mbuf   *tqe_m;		/* mbuf contains packet */
	struct  mbuf   *tqe_last;	/* last mbuf in chain */
	tcp_seq tqe_start;		/* TCP Sequence number start */
	int	tqe_len;		/* TCP segment data length */
	uint32_t tqe_flags;		/* The flags from tcp_get_flags() */
	uint32_t tqe_mbuf_cnt;		/* Count of mbuf overhead */
};
TAILQ_HEAD(tsegqe_head, tseg_qent);

/* One SACK block: a sequence-number range delimited by start and end. */
struct sackblk {
	tcp_seq start;		/* start seq no. of sack block */
	tcp_seq end;		/* end seq no. */
};

/* One hole in the SACK scoreboard, linked through scblink. */
struct sackhole {
	tcp_seq start;		/* start seq no. of hole */
	tcp_seq end;		/* end seq no. */
	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
	TAILQ_ENTRY(sackhole) scblink;	/* scoreboard linkage */
};

/* SACK bookkeeping kept alongside the scoreboard (see tcpcb.sackhint). */
struct sackhint {
	struct sackhole	*nexthole;
	int32_t		sack_bytes_rexmit;
	tcp_seq		last_sack_ack;	/* Most recent/largest sacked ack */

	int32_t		delivered_data; /* Newly acked data from last SACK */

	int32_t		sacked_bytes;	/* Total sacked bytes reported by the
					 * receiver via sack option
					 */
	uint32_t	recover_fs;	/* Flight Size at the start of Loss recovery */
	uint32_t	prr_delivered;	/* Total bytes delivered using PRR */
	uint32_t	prr_out;	/* Bytes sent during IN_RECOVERY */
};

/* True when the reassembly queue of tp holds no segments. */
#define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)

STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);

/*
 * Tcp control block, one per tcp; fields:
 * Organized for 64 byte cacheline efficiency based
 * on common tcp_input/tcp_output processing.
*/
struct tcpcb {
	/* Cache line 1 */
	struct	inpcb *t_inpcb;		/* back pointer to internet pcb */
	struct tcp_function_block *t_fb;/* TCP function call block */
	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
	uint32_t t_maxseg:24,		/* maximum segment size */
		t_logstate:8;		/* State of "black box" logging */
	uint32_t t_port:16,		/* Tunneling (over udp) port */
		t_state:4,		/* state of this connection */
		t_idle_reduce : 1,
		t_delayed_ack: 7,	/* Delayed ack variable */
		t_fin_is_rst: 1,	/* Are fin's treated as resets */
		t_log_state_set: 1,
		bits_spare : 2;
	u_int	t_flags;		/* TF_* flag bits */
	tcp_seq	snd_una;		/* sent but unacknowledged */
	tcp_seq	snd_max;		/* highest sequence number sent;
					 * used to recognize retransmits
					 */
	tcp_seq	snd_nxt;		/* send next */
	tcp_seq	snd_up;			/* send urgent pointer */
	uint32_t  snd_wnd;		/* send window */
	uint32_t  snd_cwnd;		/* congestion-controlled window */
	uint32_t t_peakrate_thr; 	/* pre-calculated peak rate threshold */
	/* Cache line 2 */
	u_int32_t  ts_offset;		/* our timestamp offset */
	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
	int	rcv_numsacks;		/* # distinct sack blks present */
	u_int	t_tsomax;		/* TSO total burst length limit in bytes */
	u_int	t_tsomaxsegcount;	/* TSO maximum segment count */
	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
	tcp_seq	rcv_nxt;		/* receive next */
	tcp_seq	rcv_adv;		/* advertised window */
	uint32_t  rcv_wnd;		/* receive window */
	u_int	t_flags2;		/* More tcpcb flags storage */
	int	t_srtt;			/* smoothed round-trip time */
	int	t_rttvar;		/* variance in round-trip time */
	u_int32_t  ts_recent;		/* timestamp echo data */
	u_char	snd_scale;		/* window scaling for send window */
	u_char	rcv_scale;		/* window scaling for recv window */
	u_char	snd_limited;		/* segments limited transmitted */
	u_char	request_r_scale;	/* pending window scaling */
	tcp_seq	last_ack_sent;
	u_int	t_rcvtime;		/* inactivity time */
	/* Cache line 3 */
	tcp_seq	rcv_up;			/* receive urgent pointer */
	int	t_segqlen;		/* segment reassembly queue length */
	uint32_t t_segqmbuflen;		/* Count of bytes mbufs on all entries */
	struct	tsegqe_head t_segq;	/* segment reassembly queue */
	struct mbuf      *t_in_pkt;
	struct mbuf	 *t_tail_pkt;
	struct tcp_timer *t_timers;	/* All the TCP timers in one struct */
	struct	vnet *t_vnet;		/* back pointer to parent vnet */
	uint32_t  snd_ssthresh;		/* snd_cwnd size threshold
					 * for slow start exponential to
					 * linear switch
					 */
	tcp_seq	snd_wl1;		/* window update seg seq number */
	/* Cache line 4 */
	tcp_seq	snd_wl2;		/* window update seg ack number */

	tcp_seq	irs;			/* initial receive sequence number */
	tcp_seq	iss;			/* initial send sequence number */
	u_int	t_acktime;		/* RACK and BBR incoming new data was acked */
	u_int	t_sndtime;		/* time last data was sent */
	u_int	ts_recent_age;		/* when last updated */
	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
	uint16_t cl4_spare;		/* Spare to adjust CL 4 */
	char	t_oobflags;		/* have some */
	char	t_iobc;			/* input character */
	int	t_rxtcur;		/* current retransmit value (ticks) */

	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
	u_int	t_rtttime;		/* RTT measurement start time */

	tcp_seq	t_rtseq;		/* sequence number being timed */
	u_int	t_starttime;		/* time connection was established */
	u_int	t_fbyte_in;		/* ticks time when first byte queued in */
	u_int	t_fbyte_out;		/* ticks time when first byte queued out */

	u_int	t_pmtud_saved_maxseg;	/* pre-blackhole MSS */
	int	t_blackhole_enter;	/* when to enter blackhole detection */
	int	t_blackhole_exit;	/* when to exit blackhole detection */
	u_int	t_rttmin;		/* minimum rtt allowed */

	u_int	t_rttbest;		/* best rtt we've seen */

	int	t_softerror;		/* possible error not yet reported */
	uint32_t  max_sndwnd;		/* largest window peer has offered */
	/* Cache line 5 */
	uint32_t  snd_cwnd_prev;	/* cwnd prior to retransmit */
	uint32_t  snd_ssthresh_prev;	/* ssthresh prior to retransmit */
	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
	int	t_sndzerowin;		/* zero-window updates sent */
	u_long	t_rttupdated;		/* number of times rtt sampled */
	int	snd_numholes;		/* number of holes seen by sender */
	u_int	t_badrxtwin;		/* window for retransmit recovery */
	TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
					/* SACK scoreboard (sorted) */
	tcp_seq	snd_fack;		/* last seq number(+1) sack'd by rcv'r*/
	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
	struct sackhint	sackhint;	/* SACK scoreboard hint */
	int	t_rttlow;		/* smallest observed RTT */
	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
	struct toedev	*tod;		/* toedev handling this connection */
	int	t_sndrexmitpack;	/* retransmit packets sent */
	int	t_rcvoopack;		/* out-of-order packets received */
	void	*t_toe;			/* TOE pcb pointer */
	struct cc_algo	*cc_algo;	/* congestion control algorithm */
	struct cc_var	*ccv;		/* congestion control specific vars */
	struct osd	*osd;		/* storage for Khelp module data */
	int	t_bytes_acked;		/* # bytes acked during current RTT */
	u_int   t_maxunacktime;
	u_int	t_keepinit;		/* time to establish connection */
	u_int	t_keepidle;		/* time before keepalive probes begin */
	u_int	t_keepintvl;		/* interval between keepalives */
	u_int	t_keepcnt;		/* number of keepalives before close */
	int	t_dupacks;		/* consecutive dup acks recd */
	int	t_lognum;		/* Number of log entries */
	int	t_loglimit;		/* Maximum number of log entries */
	uint32_t r_cep;			/* Number of received CE marked packets */
	uint32_t s_cep;			/* Synced number of delivered CE packets */
	int64_t	t_pacing_rate;		/* bytes / sec, -1 => unlimited */
	struct tcp_log_stailq t_logs;	/* Log buffer */
	struct tcp_log_id_node *t_lin;
	struct tcp_log_id_bucket *t_lib;
	const char *t_output_caller;	/* Function that called tcp_output */
	struct statsblob *t_stats;	/* Per-connection stats */
	uint32_t t_logsn;		/* Log "serial number" */
	uint32_t gput_ts;		/* Time goodput measurement started */
	tcp_seq gput_seq;		/* Outbound measurement seq */
	tcp_seq gput_ack;		/* Inbound measurement ack */
	int32_t t_stats_gput_prev;	/* XXXLAS: Prev gput measurement */
	uint32_t t_maxpeakrate;		/* max peak rate set by user, in bytes/s */
	uint32_t t_sndtlppack;		/* tail loss probe packets sent */
	uint64_t t_sndtlpbyte;		/* total tail loss probe bytes sent */
	uint64_t t_sndbytes;		/* total bytes sent */
	uint64_t t_snd_rxt_bytes;	/* total bytes retransmitted */
	uint32_t t_dsack_bytes;		/* Total number of dsack bytes we have received */
	uint32_t t_dsack_tlp_bytes;	/* Total number of dsack bytes we have received for TLPs sent */
	uint32_t t_dsack_pack;		/* Total dsack packets we have received */

	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
	uint32_t t_end_info_status;	/* Status flag of end info */
	unsigned int *t_tfo_pending;	/* TCP Fast Open server pending counter */
	union {
		uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN];
		uint64_t server;
	} t_tfo_cookie;			/* TCP Fast Open cookie to send */
	/* The end-info bytes can also be accessed as one 64-bit value. */
	union {
		uint8_t t_end_info_bytes[TCP_END_BYTE_INFO];
		uint64_t t_end_info;
	};
#ifdef TCPPCAP
	struct mbufq t_inpkts;		/* List of saved input packets. */
	struct mbufq t_outpkts;		/* List of saved output packets. */
#endif
};
#endif	/* _KERNEL || _WANT_TCPCB */

#ifdef _KERNEL
/* Header template: generic IP header bytes followed by a TCP header. */
struct tcptemp {
	u_char	tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
	struct	tcphdr tt_t;
};

/* Enable TCP/UDP tunneling port */
#define TCP_TUNNELING_PORT_MIN		0
#define TCP_TUNNELING_PORT_MAX		65535
#define TCP_TUNNELING_PORT_DEFAULT	0

/* Bounds and default for the TCP/UDP tunneling overhead */
#define TCP_TUNNELING_OVERHEAD_MIN	sizeof(struct udphdr)
#define TCP_TUNNELING_OVERHEAD_MAX	1024
#define TCP_TUNNELING_OVERHEAD_DEFAULT	TCP_TUNNELING_OVERHEAD_MIN

/* Minimum map entries limit value, if set */
#define TCP_MIN_MAP_ENTRIES_LIMIT	128

/*
 * TODO: We yet need to brave plowing in
 * to tcp_input() and the pru_usrreq() block.
 * Right now these go to the old standards which
 * are somewhat ok, but in the long term may
 * need to be changed. If we do tackle tcp_input()
 * then we need to get rid of the tcp_do_segment()
 * function below.
 */
/* Flags for tcp functions */
#define TCP_FUNC_BEING_REMOVED 0x01   	/* Can no longer be referenced */
#define	TCP_FUNC_OUTPUT_CANDROP	0x02   /* tfb_tcp_output may ask tcp_drop */

/*
 * If defining the optional tcp_timers, in the
 * tfb_tcp_timer_stop call you must use the
 * callout_async_drain() function with the
 * tcp_timer_discard callback.
You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); 
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef _NETINET_IN_PCB_H_ /* * tcp_output() * Handles tcp_drop request from advanced stacks and reports that inpcb is * gone with negative return code. * Drop in replacement for the default stack. */ static inline int tcp_output(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); tp = tcp_drop(tp, -rv); if (tp) INP_WUNLOCK(tp->t_inpcb); } return (rv); } /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. * Always returns positive error code. */ static inline int tcp_output_unlock(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); rv = -rv; tp = tcp_drop(tp, rv); if (tp) INP_WUNLOCK(tp->t_inpcb); } else INP_WUNLOCK(tp->t_inpcb); return (rv); } /* * tcp_output_nodrop() * Always returns locked. It is caller's responsibility to run tcp_drop()! * Useful in syscall implementations, when we want to perform some logging * and/or tracing with tcpcb before calling tcp_drop(). To be used with * tcp_unlock_or_drop() later. * * XXXGL: maybe don't allow stacks to return a drop request at certain * TCP states? Why would it do in connect(2)? In recv(2)? 
*/ static inline int tcp_output_nodrop(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); return (rv); } /* * tcp_unlock_or_drop() * Handle return code from tfb_tcp_output() after we have logged/traced, * to be used with tcp_output_nodrop(). */ static inline int tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) { INP_WLOCK_ASSERT(tp->t_inpcb); if (tcp_output_retval < 0) { tcp_output_retval = -tcp_output_retval; if (tcp_drop(tp, tcp_output_retval) != NULL) INP_WUNLOCK(tp->t_inpcb); } else INP_WUNLOCK(tp->t_inpcb); return (tcp_output_retval); } #endif /* _NETINET_IN_PCB_H_ */ #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid * Note: accessing and restoring from * these may only be done in the 1st * RTO recovery round (t_rxtshift == 1) */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 
0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ #define TF_UNUSED1 0x08000000 /* unused */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. 
*/
#define	TCPOOB_HAVEDATA	0x01
#define	TCPOOB_HADDATA	0x02

/*
 * Flags for the extended TCP flags field, t_flags2
 * (the value 0x00000200 is not assigned in this list).
 */
#define	TF2_PLPMTU_BLACKHOLE	0x00000001 /* Possible PLPMTUD Black Hole. */
#define	TF2_PLPMTU_PMTUD	0x00000002 /* Allowed to attempt PLPMTUD. */
#define	TF2_PLPMTU_MAXSEGSNT	0x00000004 /* Last seg sent was full seg. */
#define	TF2_LOG_AUTO		0x00000008 /* Session is auto-logging. */
#define	TF2_DROP_AF_DATA	0x00000010 /* Drop after all data ack'd */
#define	TF2_ECN_PERMIT		0x00000020 /* connection ECN-ready */
#define	TF2_ECN_SND_CWR		0x00000040 /* ECN CWR in queue */
#define	TF2_ECN_SND_ECE		0x00000080 /* ECN ECE in queue */
#define	TF2_ACE_PERMIT		0x00000100 /* Accurate ECN mode */
#define TF2_FBYTES_COMPLETE	0x00000400 /* We have first bytes in and out */
/*
 * Structure to hold TCP options that are only used during segment
 * processing (in tcp_input), but not held in the tcpcb.
 * It's basically used to reduce the number of parameters
 * to tcp_dooptions and tcp_addoptions.
 * The binary order of the to_flags is relevant for packing of the
 * options in tcp_addoptions.
 */
struct tcpopt {
	u_int32_t	to_flags;	/* which options are present */
	/* TOF_* bits for to_flags; 0x0008 and 0x0020 are not assigned here. */
#define	TOF_MSS		0x0001		/* maximum segment size */
#define	TOF_SCALE	0x0002		/* window scaling */
#define	TOF_SACKPERM	0x0004		/* SACK permitted */
#define	TOF_TS		0x0010		/* timestamp */
#define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
#define	TOF_SACK	0x0080		/* Peer sent SACK option */
#define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
#define	TOF_MAXOPT	0x0200
	u_int32_t	to_tsval;	/* new timestamp */
	u_int32_t	to_tsecr;	/* reflected timestamp */
	u_char		*to_sacks;	/* pointer to the first SACK blocks */
	u_char		*to_signature;	/* pointer to the TCP-MD5 signature */
	u_int8_t	*to_tfo_cookie; /* pointer to the TFO cookie */
	u_int16_t	to_mss;		/* maximum segment size */
	u_int8_t	to_wscale;	/* window scaling */
	u_int8_t	to_nsacks;	/* number of SACK blocks */
	u_int8_t	to_tfo_len;	/* TFO cookie length */
	u_int32_t	to_spare;	/* UTO */
};

/*
 * Flags for tcp_dooptions.
 */
#define	TO_SYN		0x01		/* parse SYN-only options */

/* Path metrics; every field is a 32-bit unsigned value. */
struct hc_metrics_lite {	/* must stay in sync with hc_metrics */
	uint32_t	rmx_mtu;	/* MTU for this path */
	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
	uint32_t	rmx_rtt;	/* estimated round trip time */
	uint32_t	rmx_rttvar;	/* estimated rtt variance */
	uint32_t	rmx_cwnd;	/* congestion window */
	uint32_t	rmx_sendpipe;   /* outbound delay-bandwidth product */
	uint32_t	rmx_recvpipe;   /* inbound delay-bandwidth product */
};

/*
 * Used by tcp_maxmtu() to communicate interface specific features
 * and limits at the time of connection setup.
*/ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ uint32_t t_port:16, /* UDP port number if TCPoUDP */ t_unused:16; tcp_seq snd_nxt; tcp_seq rcv_nxt; u_short last_win; /* cached window value */ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; u_int tw_flags; /* tcpcb t_flags */ }; /* * Accessors for the protocol control block hanging off an inpcb: both * intotcpcb() and intotw() read the same inp_ppcb pointer and differ only * in the type they cast it to, so the caller must know which one applies. */ #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 5 bits to the right of the binary point * (TCP_RTT_SCALE == 1 << TCP_RTT_SHIFT) and rttvar has 4 bits * (TCP_RTTVAR_SCALE == 1 << TCP_RTTVAR_SHIFT). * NOTE(review): this comment used to cite 3 and 2 fraction bits, with an * "ALPHA" of 0.875 for srtt and 0.75 for rttvar; those gains do not follow * from the scales below -- confirm the effective smoothing gains against * the code that uses TCP_DELTA_SHIFT in tcp_input.c. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 5 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 5 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits frac. */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits frac. */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need.
But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t 
tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ /* Dsack related stats */ uint64_t tcps_dsack_count; /* Number of ACKs arriving with DSACKs */ uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */ uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */ /* TCPS_TIME_WAIT usage stats */ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack. */ /* Accurate ECN Handshake stats */ uint64_t tcps_ace_nect; /* ACE SYN packet with Non-ECT */ uint64_t tcps_ace_ect1; /* ACE SYN packet with ECT1 */ uint64_t tcps_ace_ect0; /* ACE SYN packet with ECT0 */ uint64_t tcps_ace_ce; /* ACE SYN packet with CE */ uint64_t _pad[6]; /* 6 TBD placeholder for STABLE */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. 
*/ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); #define KMOD_TCPSTAT_ADD(name, val) \ kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. 
*/ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint32_t t_dsack_bytes; /* (n) */ uint32_t t_dsack_tlp_bytes; /* (n) */ uint32_t t_dsack_pack; /* (n) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[22]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). 
*/ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. 
*/ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * tcp_close(struct tcpcb *); void 
tcp_discardcb(struct tcpcb *); bool tcp_freecb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *); -void tcp_drain(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_handle_wakeup(struct tcpcb *, struct socket *); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct 
tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt); int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; #ifdef NETFLIX_EXP_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp6_use_min_mtu(struct tcpcb *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * 
tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb 
*tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); void tcp_decrement_paced_conn(void); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } static inline uint16_t tcp_get_flags(const struct tcphdr *th) { return (((uint16_t)th->th_x2 << 8) | th->th_flags); } static inline void tcp_set_flags(struct tcphdr *th, uint16_t flags) { th->th_x2 = (flags >> 8) & 0x0f; th->th_flags = flags & 0xff; } static inline void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp, int hw_tls) { if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; #ifdef KERN_TLS if (hw_tls && is_rxt && len != 0) { uint64_t rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * 
(tp->t_snd_rxt_bytes + tp->t_sndbytes)); if (rexmit_percent > ktls_ifnet_max_rexmit_pct) ktls_disable_ifnet(tp); } #endif } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 52534c579003..963b6a8d9aed 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -1,573 +1,558 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { -{ - .pr_type = 0, - .pr_domain = &inet6domain, - .pr_protocol = IPPROTO_IPV6, - .pr_flags = PR_CAPATTACH, - .pr_drain = frag6_drain, - .pr_usrreqs = &nousrreqs, -}, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_ctloutput = ip6_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| PR_LISTEN|PR_CAPATTACH, .pr_ctloutput = tcp_ctloutput, -#ifndef INET /* don't call initialization, timeout, and drain routines twice */ - .pr_drain = tcp_drain, -#endif .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, -#ifndef INET /* Do not call initialization and drain routines twice. 
*/ - .pr_drain = sctp_drain, -#endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD, .pr_ctloutput = sctp_ctloutput, - .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_usrreqs = &nousrreqs }, #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, 
.pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *)&inet6sw[nitems(inet6sw)], .dom_rtattach = in6_inithead, #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? 
*/ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. 
*/
SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Internet6 Family");

/* net.inet6: one child node per protocol. */
SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP6");
SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "ICMP6");
SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "UDP6");
SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP6");
#if defined(SCTP) || defined(SCTP_SUPPORT)
SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SCTP6");
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPSEC6");
#endif /* IPSEC */

/* net.inet6.ip6 */

/*
 * Sysctl handler for net.inet6.ip6.temppltime.
 *
 * Presents V_ip6_temp_preferred_lifetime through sysctl_handle_int().  If
 * the request supplies a new value (req->newptr is set), the value is
 * stored only when it is at least
 * V_ip6_desync_factor + V_ip6_temp_regen_advance.
 *
 * Returns the error reported by sysctl_handle_int(), EINVAL when the new
 * value is below that bound, and 0 otherwise.
 */
static int
sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	/* Work on a copy so a rejected value never reaches the variable. */
	val = V_ip6_temp_preferred_lifetime;
	error = sysctl_handle_int(oidp, &val, 0, req);
	/* Stop here on failure or when no new value was supplied. */
	if (error != 0 || !req->newptr)
		return (error);
	if (val < V_ip6_desync_factor + V_ip6_temp_regen_advance)
		return (EINVAL);
	V_ip6_temp_preferred_lifetime = val;
	return (0);
}

/*
 * Sysctl handler for net.inet6.ip6.tempvltime.
 *
 * Presents V_ip6_temp_valid_lifetime through sysctl_handle_int().  A new
 * value is stored only when it is not smaller than
 * V_ip6_temp_preferred_lifetime.
 *
 * Returns the error reported by sysctl_handle_int(), EINVAL when the new
 * value is below the preferred lifetime, and 0 otherwise.
 */
static int
sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	/* Work on a copy so a rejected value never reaches the variable. */
	val = V_ip6_temp_valid_lifetime;
	error = sysctl_handle_int(oidp, &val, 0, req);
	/* Stop here on failure or when no new value was supplied. */
	if (error != 0 || !req->newptr)
		return (error);
	if (val < V_ip6_temp_preferred_lifetime)
		return (EINVAL);
	V_ip6_temp_valid_lifetime = val;
	return (0);
}

SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
	"Enable forwarding of IPv6 packets between interfaces");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect,
	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0,
	"Send ICMPv6 redirects for unforwardable IPv6 packets");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim,
	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0,
	"Default hop limit to use for outgoing IPv6 packets");
SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 
statistics (struct ip6stat, netinet6/ip6_var.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 RA messages"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set clear the R flag in ICMPv6 NA messages when accepting RA " "on the interface"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding is enabled"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "Frequency in seconds at which to log IPv6 forwarding errors"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, "Default maximum number of IPv6 extension headers permitted on " "incoming IPv6 packets, 0 for no artificial limit"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, "Number of ICMPv6 NS messages sent during duplicate address detection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, "Provide an IPv6 flowlabel in outbound packets"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, "Default hop limit for IPv6 multicast packets originating from this " "node"); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, 
kame_version, CTLFLAG_RD, __KAME_VERSION, 0, "KAME version string"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, "Allow the use of addresses whose preferred lifetimes have expired"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, "Create RFC3041 temporary addresses for autoconfigured addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_temppltime, "I", "Maximum preferred lifetime for temporary addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_tempvltime, "I", "Maximum valid lifetime for temporary addresses"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, "Restrict AF_INET6 sockets to IPv6 addresses only"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6 " "link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, struct rip6stat, rip6stat, "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, "Prefer RFC3041 temporary addresses in source address selection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0, "Use the default scope zone when none is specified"); SYSCTL_INT(_net_inet6_ip6, 
IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "Enable path MTU discovery for multicast packets"); #ifdef IPSTEALTH SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, "Forward IPv6 packets without decrementing their TTL"); #endif /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "Accept ICMPv6 redirect messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "Delay in seconds before expiring redirect route"); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, "Frequency in seconds of checks for expired prefixes and routers"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, "Delay in seconds before probing for reachability"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, "Number of ICMPv6 NS messages sent during reachability detection"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, "Number of ICMPv6 NS messages sent during address resolution"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, "Create a loopback route when configuring an IPv6 address"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "Mask of enabled RFC4620 node information query types"); SYSCTL_INT(_net_inet6_icmp6, 
ICMPV6CTL_NODEINFO_OLDMCPREFIX, nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup " "for compatibility with KAME implementation"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, "Maximum number of ICMPv6 error messages per second"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, "Log NDP debug messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861"); #ifdef EXPERIMENTAL SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_ignore_ipv6_only_ra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_ignore_ipv6_only_ra), 0, "Ignore the 'IPv6-Only flag' in RA messages in compliance with " "draft-ietf-6man-ipv6only-flag"); #endif diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index 8d8cef359d90..52c70292f920 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -1,1724 +1,1728 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include ipproto_input_t *ip6_protox[IPPROTO_MAX] = { [0 ... IPPROTO_MAX - 1] = rip6_input }; ipproto_ctlinput_t *ip6_ctlprotox[IPPROTO_MAX] = { [0 ... 
IPPROTO_MAX - 1] = rip6_ctlinput };

VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead);
VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
VNET_DEFINE(u_long, in6_ifaddrhmask);

/*
 * netisr handler for NETISR_IPV6; packets are delivered to ip6_input().
 * With RSS the CPU is chosen by rss_soft_m2cpuid_v6() under the CPU policy
 * with hybrid dispatch; without RSS the flow policy is used.
 */
static struct netisr_handler ip6_nh = {
	.nh_name = "ip6",
	.nh_handler = ip6_input,
	.nh_proto = NETISR_IPV6,
#ifdef RSS
	.nh_m2cpuid = rss_soft_m2cpuid_v6,
	.nh_policy = NETISR_POLICY_CPU,
	.nh_dispatch = NETISR_DISPATCH_HYBRID,
#else
	.nh_policy = NETISR_POLICY_FLOW,
#endif
};

/*
 * Sysctl handler for net.inet6.ip6.intr_queue_maxlen.
 *
 * Reads the current queue limit of ip6_nh with netisr_getqlimit() and
 * presents it through sysctl_handle_int().  A new value smaller than 1 is
 * rejected with EINVAL; any other new value is passed to
 * netisr_setqlimit(), whose result is returned.
 */
static int
sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
{
	int error, qlimit;

	netisr_getqlimit(&ip6_nh, &qlimit);
	error = sysctl_handle_int(oidp, &qlimit, 0, req);
	/* Stop here on failure or when no new value was supplied. */
	if (error || !req->newptr)
		return (error);
	if (qlimit < 1)
		return (EINVAL);
	return (netisr_setqlimit(&ip6_nh, qlimit));
}
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    0, 0, sysctl_netinet6_intr_queue_maxlen, "I",
    "Maximum size of the IPv6 input queue");

/*
 * Per-VNET switch read by ip6_input(): when true, a packet addressed to a
 * local address is dropped if its source address is also a local address
 * (packets flagged M_LOOP are exempt).
 */
VNET_DEFINE_STATIC(bool, ip6_sav) = true;
#define	V_ip6_sav	VNET(ip6_sav)
SYSCTL_BOOL(_net_inet6_ip6, OID_AUTO, source_address_validation,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sav), true,
    "Drop incoming packets with source address that is a local address");

#ifdef RSS
/*
 * netisr handler for NETISR_IPV6_DIRECT; packets are delivered to
 * ip6_direct_input().
 */
static struct netisr_handler ip6_direct_nh = {
	.nh_name = "ip6_direct",
	.nh_handler = ip6_direct_input,
	.nh_proto = NETISR_IPV6_DIRECT,
	.nh_m2cpuid = rss_soft_m2cpuid_v6,
	.nh_policy = NETISR_POLICY_CPU,
	.nh_dispatch = NETISR_DISPATCH_HYBRID,
};

/*
 * Sysctl handler for net.inet6.ip6.intr_direct_queue_maxlen.
 *
 * Same logic as sysctl_netinet6_intr_queue_maxlen(), applied to
 * ip6_direct_nh: values below 1 yield EINVAL, otherwise the result of
 * netisr_setqlimit() is returned.
 */
static int
sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
{
	int error, qlimit;

	netisr_getqlimit(&ip6_direct_nh, &qlimit);
	error = sysctl_handle_int(oidp, &qlimit, 0, req);
	/* Stop here on failure or when no new value was supplied. */
	if (error || !req->newptr)
		return (error);
	if (qlimit < 1)
		return (EINVAL);
	return (netisr_setqlimit(&ip6_direct_nh, qlimit));
}
SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    0, 0, sysctl_netinet6_intr_direct_queue_maxlen, "I", "Maximum 
size of the IPv6 direct input queue");
#endif

VNET_DEFINE(pfil_head_t, inet6_pfil_head);

VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat);
VNET_PCPUSTAT_SYSINIT(ip6stat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(ip6stat);
#endif /* VIMAGE */

struct rmlock in6_ifaddr_lock;
RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock");

static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);

/*
 * Per-VNET IPv6 initialization (hooked up through VNET_SYSINIT).
 *
 * In order, this routine:
 *  - fetches the auto_linklocal, accept_rtadv and no_radr tunables;
 *  - initializes the interface address list and its hash table;
 *  - registers the IPv6 pfil head for inbound and outbound packets;
 *  - registers the IPsec input and output helper hook heads, printing a
 *    warning if either registration fails;
 *  - calls scope6_init(), addrsel_policy_init(), nd6_init() and
 *    frag6_init();
 *  - picks V_ip6_desync_factor as arc4random() % MAX_TEMP_DESYNC_FACTOR;
 *  - under VIMAGE, registers the netisr handler(s) for this vnet.
 *
 * The argument is unused.
 */
static void
ip6_vnet_init(void *arg __unused)
{
	struct pfil_head_args args;

	TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal",
	    &V_ip6_auto_linklocal);
	TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv);
	TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr);

	CK_STAILQ_INIT(&V_in6_ifaddrhead);
	V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR,
	    &V_in6_ifaddrhmask);

	/* Initialize packet filter hooks. */
	args.pa_version = PFIL_VERSION;
	args.pa_flags = PFIL_IN | PFIL_OUT;
	args.pa_type = PFIL_TYPE_IP6;
	args.pa_headname = PFIL_INET6_NAME;
	V_inet6_pfil_head = pfil_head_register(&args);

	/* A failed helper-hook registration is only reported, not fatal. */
	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6,
	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET6],
	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register input helper hook\n",
		    __func__);
	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6,
	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET6],
	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register output helper hook\n",
		    __func__);

	scope6_init();
	addrsel_policy_init();
	nd6_init();
	frag6_init();

	/* Random value in [0, MAX_TEMP_DESYNC_FACTOR). */
	V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;

	/* Skip global initialization stuff for non-default instances. 
*/
#ifdef VIMAGE
	/* Attach this vnet to the netisr handler(s). */
	netisr_register_vnet(&ip6_nh);
#ifdef RSS
	netisr_register_vnet(&ip6_direct_nh);
#endif
#endif
}
VNET_SYSINIT(ip6_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    ip6_vnet_init, NULL);

/*
 * Global (non-VNET) IPv6 initialization, hooked up through SYSINIT.
 *
 * Registers the input/control-input handlers of the statically linked
 * protocols, subscribes frag6_drain to the vm_lowmem and mbuf_lowmem
 * events, and registers the IPv6 netisr handler(s).  The argument is
 * unused.
 */
static void
ip6_init(void *arg __unused)
{

	/*
	 * Register statically those protocols that are unlikely to ever go
	 * dynamic.
	 * NOTE(review): IP6PROTO_REGISTER is defined outside this file
	 * section; presumably it wraps ip6proto_register() - confirm.
	 */
	IP6PROTO_REGISTER(IPPROTO_ICMPV6, icmp6_input, rip6_ctlinput);
	IP6PROTO_REGISTER(IPPROTO_DSTOPTS, dest6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_ROUTING, route6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_FRAGMENT, frag6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_IPV4, encap6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_IPV6, encap6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_ETHERIP, encap6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_GRE, encap6_input, NULL);
	IP6PROTO_REGISTER(IPPROTO_PIM, encap6_input, NULL);
#ifdef SCTP	/* XXX: has a loadable & static version */
	IP6PROTO_REGISTER(IPPROTO_SCTP, sctp6_input, sctp6_ctlinput);
#endif

	/* Have frag6_drain called on both low-memory events. */
+	EVENTHANDLER_REGISTER(vm_lowmem, frag6_drain, NULL, LOWMEM_PRI_DEFAULT);
+	EVENTHANDLER_REGISTER(mbuf_lowmem, frag6_drain, NULL,
+	    LOWMEM_PRI_DEFAULT);
+
	netisr_register(&ip6_nh);
#ifdef RSS
	netisr_register(&ip6_direct_nh);
#endif
}
SYSINIT(ip6_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_init, NULL);

/*
 * Install input and control-input handlers for an IPv6 next-header value.
 *
 * proto must be non-zero (asserted with MPASS).  The slot is considered
 * free while ip6_protox[proto] still holds rip6_input.
 *
 * Returns 0 when the handlers were installed, EEXIST when the slot already
 * holds a handler other than rip6_input.
 */
int
ip6proto_register(uint8_t proto, ipproto_input_t input,
    ipproto_ctlinput_t ctl)
{

	MPASS(proto > 0);

	if (ip6_protox[proto] == rip6_input) {
		ip6_protox[proto] = input;
		ip6_ctlprotox[proto] = ctl;
		return (0);
	} else
		return (EEXIST);
}

/*
 * Remove the handlers installed for an IPv6 next-header value by putting
 * rip6_input and rip6_ctlinput back into the slot.
 *
 * proto must be non-zero (asserted with MPASS).
 *
 * Returns 0 on success, ENOENT when the slot already holds rip6_input.
 */
int
ip6proto_unregister(uint8_t proto)
{

	MPASS(proto > 0);

	if (ip6_protox[proto] != rip6_input) {
		ip6_protox[proto] = rip6_input;
		ip6_ctlprotox[proto] = rip6_ctlinput;
		return (0);
	} else
		return (ENOENT);
}

#ifdef VIMAGE
/*
 * Per-VNET IPv6 teardown (hooked up through VNET_SYSUNINIT).
 *
 * Unregisters the netisr handler(s) and the pfil head for this vnet,
 * deregisters the IPsec helper hook heads, purges AF_INET6 addresses from
 * every interface, flushes AF_INET6 routes and then destroys the fragment,
 * neighbor-discovery, interface-attach and address-hash state.
 */
static void
ip6_destroy(void *unused __unused)
{
	struct ifaddr *ifa, *nifa;
	struct ifnet *ifp;
	int error;

#ifdef RSS
	netisr_unregister_vnet(&ip6_direct_nh);
#endif
	netisr_unregister_vnet(&ip6_nh);

	pfil_head_unregister(V_inet6_pfil_head);
	error = 
hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]);
	/* A failed deregistration is reported and teardown continues. */
	if (error != 0) {
		printf("%s: WARNING: unable to deregister input helper hook "
		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: "
		    "error %d returned\n", __func__, error);
	}
	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]);
	if (error != 0) {
		printf("%s: WARNING: unable to deregister output helper hook "
		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: "
		    "error %d returned\n", __func__, error);
	}

	/* Cleanup addresses. */
	IFNET_RLOCK();
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		/* Cannot lock here - lock recursion. */
		/* IF_ADDR_LOCK(ifp); */
		/* The _SAFE iterator is used while addresses are purged. */
		CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {

			/* Only IPv6 addresses are purged here. */
			if (ifa->ifa_addr->sa_family != AF_INET6)
				continue;
			in6_purgeaddr(ifa);
		}
		/* IF_ADDR_UNLOCK(ifp); */
		in6_ifdetach_destroy(ifp);
		mld_domifdetach(ifp);
	}
	IFNET_RUNLOCK();

	/* Make sure any routes are gone as well. */
	rib_flush_routes_family(AF_INET6);

	frag6_destroy();
	nd6_destroy();
	in6_ifattach_destroy();

	hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask);
}

VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL);
#endif

/*
 * Process the Hop-by-Hop options header of an incoming packet on behalf of
 * ip6_input().
 *
 * mp      - in/out mbuf pointer; re-read after ip6_hopopts_input() because
 *           that routine may modify it.
 * plen    - in/out payload length, passed on to ip6_hopopts_input().
 * rtalert - in/out router alert value; ~0 means no router alert option.
 * off     - in/out header offset, passed on to ip6_hopopts_input().
 * nxt     - out: next-header value taken from the Hop-by-Hop header.
 * ours    - out: set to 1 for an MLD router alert when forwarding is on.
 *
 * Returns 0 when processing should continue and 1 when the caller must
 * stop handling the packet (ip6_input() returns immediately in that case).
 */
static int
ip6_input_hbh(struct mbuf **mp, uint32_t *plen, uint32_t *rtalert, int *off,
    int *nxt, int *ours)
{
	struct mbuf *m;
	struct ip6_hdr *ip6;
	struct ip6_hbh *hbh;

	if (ip6_hopopts_input(plen, rtalert, mp, off)) {
#if 0	/*touches NULL pointer*/
		in6_ifstat_inc((*mp)->m_pkthdr.rcvif, ifs6_in_discard);
#endif
		goto out;	/* m has already been freed */
	}
	/* adjust pointer */
	m = *mp;
	ip6 = mtod(m, struct ip6_hdr *);

	/*
	 * If the payload length field is 0 and the next header field
	 * indicates Hop-by-Hop Options header, then a Jumbo Payload
	 * option MUST be included.
	 */
	if (ip6->ip6_plen == 0 && *plen == 0) {
		/*
		 * Note that if a valid jumbo payload option is
		 * contained, ip6_hopopts_input() must set a valid
		 * (non-zero) payload length to the variable plen. 
*/
		IP6STAT_INC(ip6s_badoptions);
		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
		/*
		 * The last argument is the byte offset of ip6_plen within
		 * the IPv6 header.
		 */
		icmp6_error(m, ICMP6_PARAM_PROB,
			    ICMP6_PARAMPROB_HEADER,
			    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
		goto out;
	}
	/* ip6_hopopts_input() ensures that mbuf is contiguous */
	hbh = (struct ip6_hbh *)(ip6 + 1);
	*nxt = hbh->ip6h_nxt;

	/*
	 * If we are acting as a router and the packet contains a
	 * router alert option, see if we know the option value.
	 * Currently, we only support the option value for MLD, in which
	 * case we should pass the packet to the multicast routing
	 * daemon.
	 */
	/* ~0 is the "no router alert" value that ip6_input() starts with. */
	if (*rtalert != ~0) {
		switch (*rtalert) {
		case IP6OPT_RTALERT_MLD:
			if (V_ip6_forwarding)
				*ours = 1;
			break;
		default:
			/*
			 * RFC2711 requires unrecognized values must be
			 * silently ignored.
			 */
			break;
		}
	}

	return (0);

out:
	/* Tell the caller to stop processing this packet. */
	return (1);
}

#ifdef RSS
/*
 * IPv6 direct input routine.
 *
 * This is called when reinjecting completed fragments where
 * all of the previous checking and book-keeping has been done.
 *
 * The next-header value and the offset to resume at are taken from the
 * IPV6_TAG_DIRECT mbuf tag, which is deleted before the header chain is
 * walked.
 */
void
ip6_direct_input(struct mbuf *m)
{
	int off, nxt;
	int nest;
	struct m_tag *mtag;
	struct ip6_direct_ctx *ip6dc;

	mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL);
	KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!"));

	/* The context structure is stored directly after the tag header. */
	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
	nxt = ip6dc->ip6dc_nxt;
	off = ip6dc->ip6dc_off;

	nest = 0;

	/* nxt and off were copied out above, so the tag can go. */
	m_tag_delete(m, mtag);

	while (nxt != IPPROTO_DONE) {
		/* A limit of 0 disables the nesting check. */
		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
			IP6STAT_INC(ip6s_toomanyhdr);
			goto bad;
		}

		/*
		 * protection against faulty packet - there should be
		 * more sanity checks in header chain processing. 
*/
		if (m->m_pkthdr.len < off) {
			IP6STAT_INC(ip6s_tooshort);
			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
			goto bad;
		}

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
		if (IPSEC_ENABLED(ipv6)) {
			/*
			 * NOTE(review): m is not freed on this path;
			 * presumably IPSEC_INPUT() has taken over the
			 * packet when it returns non-zero - confirm.
			 */
			if (IPSEC_INPUT(ipv6, m, off, nxt) != 0)
				return;
		}
#endif /* IPSEC */

		/*
		 * Dispatch to the handler for this header; its return
		 * value is the next nxt, and IPPROTO_DONE ends the loop.
		 */
		nxt = ip6_protox[nxt](&m, &off, nxt);
	}
	return;
bad:
	m_freem(m);
}
#endif

void
ip6_input(struct mbuf *m)
{
	struct in6_addr odst;
	struct ip6_hdr *ip6;
	struct in6_ifaddr *ia;
	struct ifnet *rcvif;
	u_int32_t plen;
	u_int32_t rtalert = ~0;
	int off = sizeof(struct ip6_hdr), nest;
	int nxt, ours = 0;
	int srcrt = 0;

	/*
	 * Drop the packet if IPv6 operation is disabled on the interface.
	 */
	rcvif = m->m_pkthdr.rcvif;
	if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED))
		goto bad;

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	/*
	 * should the inner packet be considered authentic?
	 * see comment in ah4_input().
	 * NB: m cannot be NULL when passed to the input routine
	 */

	m->m_flags &= ~M_AUTHIPHDR;
	m->m_flags &= ~M_AUTHIPDGM;

#endif /* IPSEC */

	if (m->m_flags & M_FASTFWD_OURS) {
		/*
		 * Firewall changed destination to local.
		 */
		ip6 = mtod(m, struct ip6_hdr *);
		goto passin;
	}

	/*
	 * mbuf statistics
	 */
	if (m->m_flags & M_EXT) {
		if (m->m_next)
			IP6STAT_INC(ip6s_mext2m);
		else
			IP6STAT_INC(ip6s_mext1);
	} else {
		if (m->m_next) {
			struct ifnet *ifp = (m->m_flags & M_LOOP) ? V_loif : rcvif;
			int ifindex = ifp->if_index;
			if (ifindex >= IP6S_M2MMAX)
				ifindex = 0;
			IP6STAT_INC(ip6s_m2m[ifindex]);
		} else
			IP6STAT_INC(ip6s_m1);
	}

	in6_ifstat_inc(rcvif, ifs6_in_receive);
	IP6STAT_INC(ip6s_total);

	/*
	 * L2 bridge code and some other code can return mbuf chain
	 * that does not conform to KAME requirement.  too bad.
	 * XXX: fails to join if interface MTU > MCLBYTES.  jumbogram? 
*/ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) goto bad; m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. 
A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * We have supported IPv6-only kernels for a few years and this issue * has not come up. The world seems to move mostly towards not using * v4mapped on the wire, so it makes sense for us to keep rejecting * any such packets. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Try to forward the packet, but if we fail continue. * ip6_tryforward() does not generate redirects, so fall * through to normal processing if redirects are required. * ip6_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip6 pointer. */ if (V_ip6_forwarding != 0 && V_ip6_sendredirects == 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv6) || IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip6_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { ip6 = mtod(m, struct ip6_hdr *); goto passin; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. 
*/ if (IPSEC_ENABLED(ipv6) && IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_IN(V_inet6_pfil_head)) goto passin; odst = ip6->ip6_dst; if (pfil_run_hooks(V_inet6_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) != PFIL_PASS) return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. 
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); goto bad; } if (V_ip6_sav && !(m->m_flags & M_LOOP) && __predict_false(in6_localip_fib(&ip6->ip6_src, rcvif->if_fib))) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(&m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. 
*/ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) { IP6STAT_INC(ip6s_cantforward); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = ip6_protox[nxt](&m, &off, nxt); } return; bad: in6_ifstat_inc(rcvif, ifs6_in_discard); if (m != NULL) m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. 
*
 * rtalertp - XXX: should be stored in a smarter way
 *
 * On success 0 is returned, *offp is advanced past the Hop-by-Hop header
 * and *mp is set to the mbuf that now heads the chain.  On every failure
 * path -1 is returned and *mp is set to NULL.
 */
static int
ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
    struct mbuf **mp, int *offp)
{
	struct mbuf *pkt;
	struct ip6_hbh *hdr;
	int start, total, optbytes;

	pkt = *mp;
	start = *offp;

	/* The fixed part of the header has to be readable via mtod(). */
	if (pkt->m_len < start + sizeof(*hdr)) {
		pkt = m_pullup(pkt, start + sizeof(*hdr));
		if (pkt == NULL)
			goto toolong;
	}
	hdr = (struct ip6_hbh *)(mtod(pkt, caddr_t) + start);

	/* Full header length in bytes: (ip6h_len + 1) * 8. */
	total = (hdr->ip6h_len + 1) << 3;
	if (pkt->m_len < start + total) {
		pkt = m_pullup(pkt, start + total);
		if (pkt == NULL)
			goto toolong;
	}
	/* pkt may have been replaced by m_pullup(); look the header up again. */
	hdr = (struct ip6_hbh *)(mtod(pkt, caddr_t) + start);

	/* Only the option area (header minus its fixed part) is scanned. */
	optbytes = total - sizeof(struct ip6_hbh);
	if (ip6_process_hopopts(pkt,
	    (u_int8_t *)hdr + sizeof(struct ip6_hbh), optbytes,
	    rtalertp, plenp) < 0) {
		*mp = NULL;
		return (-1);
	}

	*offp = start + total;
	*mp = pkt;
	return (0);

toolong:
	/* m_pullup() returned NULL: count it and report no packet. */
	IP6STAT_INC(ip6s_exthdrtoolong);
	*mp = NULL;
	return (-1);
}

/*
 * Search header for all Hop-by-hop options and process each option.
 * This function is separate from ip6_hopopts_input() in order to
 * handle a case where the sending node itself processes its hop-by-hop
 * options header. In such a case, the function is called from ip6_output().
 *
 * The function assumes that hbh header is located right after the IPv6 header
 * (RFC2460 p7), opthead is pointer into data content in m, and opthead to
 * opthead + hbhlen is located in contiguous memory region.
*/ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. 
*/ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * The routine will be called from upper layer handlers like tcp6_input(). 
* Thus the routine assumes that the caller (tcp6_input) has already
 * called m_pullup() and all the extension headers are located in the
 * very first mbuf on the mbuf chain.
 *
 * ip6_savecontrol_v4 will handle those options that are possible to be
 * set on a v4-mapped socket.
 * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
 * options and handle the v6-only ones itself.
 *
 * ip6_savecontrol_v4 links each control mbuf it creates at *mp and then
 * moves mp to that mbuf's m_next, returning the final mp so the caller can
 * keep appending.  An entry whose sbcreatecontrol() call returns NULL is
 * skipped.  If v4only is not NULL, *v4only is set to 1 when the version
 * field of the header is not IPv6 and to 0 otherwise.
 */
struct mbuf **
ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
    int *v4only)
{
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

#ifdef SO_TIMESTAMP
	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
		union {
			struct timeval tv;
			struct bintime bt;
			struct timespec ts;
		} t;
		struct bintime boottimebin, bt1;
		struct timespec ts1;
		bool stamped;

		/*
		 * In every case below: when the mbuf has both M_PKTHDR and
		 * M_TSTMP set, the timestamp carried by the mbuf is used;
		 * otherwise the corresponding clock is read now.
		 */
		stamped = false;
		switch (inp->inp_socket->so_ts_clock) {
		case SO_TS_REALTIME_MICRO:
			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
			    M_TSTMP)) {
				/*
				 * mbuf timestamp + boot time, as a timeval.
				 * NOTE(review): presumably the mbuf timestamp
				 * is relative to boot - confirm.
				 */
				mbuf_tstmp2timespec(m, &ts1);
				timespec2bintime(&ts1, &bt1);
				getboottimebin(&boottimebin);
				bintime_add(&bt1, &boottimebin);
				bintime2timeval(&bt1, &t.tv);
			} else {
				microtime(&t.tv);
			}
			*mp = sbcreatecontrol(&t.tv, sizeof(t.tv),
			    SCM_TIMESTAMP, SOL_SOCKET, M_NOWAIT);
			if (*mp != NULL) {
				mp = &(*mp)->m_next;
				stamped = true;
			}
			break;

		case SO_TS_BINTIME:
			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
			    M_TSTMP)) {
				/* mbuf timestamp + boot time, as a bintime. */
				mbuf_tstmp2timespec(m, &ts1);
				timespec2bintime(&ts1, &t.bt);
				getboottimebin(&boottimebin);
				bintime_add(&t.bt, &boottimebin);
			} else {
				bintime(&t.bt);
			}
			*mp = sbcreatecontrol(&t.bt, sizeof(t.bt), SCM_BINTIME,
			    SOL_SOCKET, M_NOWAIT);
			if (*mp != NULL) {
				mp = &(*mp)->m_next;
				stamped = true;
			}
			break;

		case SO_TS_REALTIME:
			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
			    M_TSTMP)) {
				/* mbuf timestamp + boot time, as a timespec. */
				mbuf_tstmp2timespec(m, &t.ts);
				getboottimebin(&boottimebin);
				bintime2timespec(&boottimebin, &ts1);
				timespecadd(&t.ts, &ts1, &t.ts);
			} else {
				nanotime(&t.ts);
			}
			*mp = sbcreatecontrol(&t.ts, sizeof(t.ts),
			    SCM_REALTIME, SOL_SOCKET, M_NOWAIT);
			if (*mp != NULL) {
				mp = &(*mp)->m_next;
				stamped = true;
			}
			break;

		case SO_TS_MONOTONIC:
			/* No boot-time offset is applied for this clock. */
			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
			    M_TSTMP))
				mbuf_tstmp2timespec(m, &t.ts);
			else
				nanouptime(&t.ts);
			*mp = sbcreatecontrol(&t.ts, sizeof(t.ts),
			    SCM_MONOTONIC, SOL_SOCKET, M_NOWAIT);
			if (*mp != NULL) {
				mp = &(*mp)->m_next;
				stamped = true;
			}
			break;

		default:
			panic("unknown (corrupted) so_ts_clock");
		}

		/*
		 * A timestamp message was queued and it came from the mbuf:
		 * follow it with an SCM_TIME_INFO message describing it.
		 */
		if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) ==
		    (M_PKTHDR | M_TSTMP)) {
			struct sock_timestamp_info sti;

			bzero(&sti, sizeof(sti));
			sti.st_info_flags = ST_INFO_HW;
			if ((m->m_flags & M_TSTMP_HPREC) != 0)
				sti.st_info_flags |= ST_INFO_HW_HPREC;
			*mp = sbcreatecontrol(&sti, sizeof(sti),
			    SCM_TIME_INFO, SOL_SOCKET, M_NOWAIT);
			if (*mp != NULL)
				mp = &(*mp)->m_next;
		}
	}
#endif

/* Pick x when the pcb has IN6P_RFC2292 set, y otherwise. */
#define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
	/* RFC 2292 sec. 5 */
	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
		struct in6_pktinfo pi6;

		/* Version field is not IPv6: read the buffer as an IPv4 header. */
		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
#ifdef INET
			struct ip *ip;

			ip = mtod(m, struct ip *);
			/*
			 * Low 32 bits carry the IPv4 destination.
			 * NOTE(review): presumably this builds a v4-mapped
			 * address; IPV6_ADDR_INT32_SMP is defined elsewhere.
			 */
			pi6.ipi6_addr.s6_addr32[0] = 0;
			pi6.ipi6_addr.s6_addr32[1] = 0;
			pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
			pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr;
#else
			/* We won't hit this code */
			bzero(&pi6.ipi6_addr, sizeof(struct in6_addr));
#endif
		} else {
			bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
			in6_clearscope(&pi6.ipi6_addr);	/* XXX */
		}
		/* Interface index of the receiving interface, or 0 if none. */
		pi6.ipi6_ifindex =
		    (m && m->m_pkthdr.rcvif) ?
		    m->m_pkthdr.rcvif->if_index : 0;

		*mp = sbcreatecontrol(&pi6, sizeof(struct in6_pktinfo),
		    IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6,
		    M_NOWAIT);
		if (*mp)
			mp = &(*mp)->m_next;
	}

	if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
		int hlim;

		/* For a non-IPv6 header the IPv4 TTL stands in for the hop limit. */
		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
#ifdef INET
			struct ip *ip;

			ip = mtod(m, struct ip *);
			hlim = ip->ip_ttl;
#else
			/* We won't hit this code */
			hlim = 0;
#endif
		} else {
			hlim = ip6->ip6_hlim & 0xff;
		}
		*mp = sbcreatecontrol(&hlim, sizeof(int),
		    IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
		    IPPROTO_IPV6, M_NOWAIT);
		if (*mp)
			mp = &(*mp)->m_next;
	}

	if ((inp->inp_flags & IN6P_TCLASS) != 0) {
		int tclass;

		/* For a non-IPv6 header the IPv4 TOS stands in for the class. */
		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
#ifdef INET
			struct ip *ip;

			ip = mtod(m, struct ip *);
			tclass = ip->ip_tos;
#else
			/* We won't hit this code */
			tclass = 0;
#endif
		} else {
			u_int32_t flowinfo;

			/*
			 * Host-order flow info, shifted right by 20 bits and
			 * masked to 8 bits, gives the traffic class.
			 */
			flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
			flowinfo >>= 20;
			tclass = flowinfo & 0xff;
		}
		*mp = sbcreatecontrol(&tclass, sizeof(int), IPV6_TCLASS,
		    IPPROTO_IPV6, M_NOWAIT);
		if (*mp)
			mp = &(*mp)->m_next;
	}

	/* Tell the caller whether the header was not an IPv6 one. */
	if (v4only != NULL) {
		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
			*v4only = 1;
		} else {
			*v4only = 0;
		}
	}

	return (mp);
}

/*
 * Append control mbufs for the received packet m to the chain at *mp,
 * according to the flags set on inp.  The options shared with v4-mapped
 * sockets are produced by ip6_savecontrol_v4(); if that reports a non-IPv6
 * header nothing further is added.
 */
void
ip6_savecontrol(struct inpcb *inp, struct mbuf *m, struct mbuf **mp)
{
	struct ip6_hdr *ip6;
	int v4only = 0;

	mp = ip6_savecontrol_v4(inp, m, mp, &v4only);
	if (v4only)
		return;

	ip6 = mtod(m, struct ip6_hdr *);
	/*
	 * IPV6_HOPOPTS socket option.  Recall that we required super-user
	 * privilege for the option (see ip6_ctloutput), but it might be too
	 * strict, since there might be some hop-by-hop options which can be
	 * returned to normal user.
	 * See also RFC 2292 section 6 (or RFC 3542 section 8).
	 */
	if ((inp->inp_flags & IN6P_HOPOPTS) != 0) {
		/*
		 * Check if a hop-by-hop options header is contained in the
		 * received packet, and if so, store the options as ancillary
		 * data. Note that a hop-by-hop options header must be
		 * just after the IPv6 header, which is assured through the
		 * IPv6 input processing.
		 */
		if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
			struct ip6_hbh *hbh;
			u_int hbhlen;

			/* Header sits directly behind the fixed IPv6 header. */
			hbh = (struct ip6_hbh *)(ip6 + 1);
			/* Length in bytes: (ip6h_len + 1) * 8. */
			hbhlen = (hbh->ip6h_len + 1) << 3;

			/*
			 * XXX: We copy the whole header even if a
			 * jumbo payload option is included, the option which
			 * is to be removed before returning according to
			 * RFC2292.
			 * Note: this constraint is removed in RFC3542
			 */
			*mp = sbcreatecontrol(hbh, hbhlen,
			    IS2292(inp, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
			    IPPROTO_IPV6, M_NOWAIT);
			if (*mp)
				mp = &(*mp)->m_next;
		}
	}

	if ((inp->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
		int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);

		/*
		 * Search for destination options headers or routing
		 * header(s) through the header chain, and stores each
		 * header as ancillary data.
		 * Note that the order of the headers remains in
		 * the chain of ancillary data.
		 */
		while (1) {	/* is explicit loop prevention necessary? */
			struct ip6_ext *ip6e = NULL;
			u_int elen;

			/*
			 * if it is not an extension header, don't try to
			 * pull it from the chain.
			 */
			switch (nxt) {
			case IPPROTO_DSTOPTS:
			case IPPROTO_ROUTING:
			case IPPROTO_HOPOPTS:
			case IPPROTO_AH: /* is it possible? */
				break;
			default:
				goto loopend;
			}

			/* Stop unless the generic header fits in the first mbuf. */
			if (off + sizeof(*ip6e) > m->m_len)
				goto loopend;
			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
			/*
			 * AH length is (ip6e_len + 2) * 4 bytes; the other
			 * headers use (ip6e_len + 1) * 8 bytes.
			 */
			if (nxt == IPPROTO_AH)
				elen = (ip6e->ip6e_len + 2) << 2;
			else
				elen = (ip6e->ip6e_len + 1) << 3;
			/* Stop unless the whole header fits in the first mbuf. */
			if (off + elen > m->m_len)
				goto loopend;

			switch (nxt) {
			case IPPROTO_DSTOPTS:
				if (!(inp->inp_flags & IN6P_DSTOPTS))
					break;

				*mp = sbcreatecontrol(ip6e, elen,
				    IS2292(inp, IPV6_2292DSTOPTS, IPV6_DSTOPTS),
				    IPPROTO_IPV6, M_NOWAIT);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_ROUTING:
				if (!(inp->inp_flags & IN6P_RTHDR))
					break;

				*mp = sbcreatecontrol(ip6e, elen,
				    IS2292(inp, IPV6_2292RTHDR, IPV6_RTHDR),
				    IPPROTO_IPV6, M_NOWAIT);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_HOPOPTS:
			case IPPROTO_AH: /* is it possible? */
				/* Walked over but not reported here. */
				break;

			default:
				/*
				 * other cases have been filtered in the above.
				 * none will visit this case.  here we supply
				 * the code just in case (nxt overwritten or
				 * other cases).
				 */
				goto loopend;

			}

			/* proceed with the next header. */
			off += elen;
			nxt = ip6e->ip6e_nxt;
			ip6e = NULL;
		}
	  loopend:
		;
	}

	if (inp->inp_flags2 & INP_RECVFLOWID) {
		uint32_t flowid, flow_type;

		flowid = m->m_pkthdr.flowid;
		flow_type = M_HASHTYPE_GET(m);

		/*
		 * XXX should handle the failure of one or the
		 * other - don't populate both?
		 */
		*mp = sbcreatecontrol(&flowid, sizeof(uint32_t), IPV6_FLOWID,
		    IPPROTO_IPV6, M_NOWAIT);
		if (*mp)
			mp = &(*mp)->m_next;
		*mp = sbcreatecontrol(&flow_type, sizeof(uint32_t),
		    IPV6_FLOWTYPE, IPPROTO_IPV6, M_NOWAIT);
		if (*mp)
			mp = &(*mp)->m_next;
	}

#ifdef	RSS
	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
		uint32_t flowid, flow_type;
		uint32_t rss_bucketid;

		flowid = m->m_pkthdr.flowid;
		flow_type = M_HASHTYPE_GET(m);

		/* Only report a bucket when rss_hash2bucket() returns 0. */
		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
			*mp = sbcreatecontrol(&rss_bucketid, sizeof(uint32_t),
			    IPV6_RSSBUCKETID, IPPROTO_IPV6, M_NOWAIT);
			if (*mp)
				mp = &(*mp)->m_next;
		}
	}
#endif

}
#undef IS2292

/*
 * Queue an IPV6_PATHMTU control message carrying mtu and dst on the receive
 * buffer of inp's socket, provided the pcb has IN6P_MTU set and dst matches
 * the pcb's foreign address (or the pcb has none).
 */
void
ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu)
{
	struct socket *so;
	struct mbuf *m_mtu;
	struct ip6_mtuinfo mtuctl;

	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	/*
	 * Notify the error by sending IPV6_PATHMTU ancillary data if
	 * application wanted to know the MTU value.
	 * NOTE: we notify disconnected sockets, because some udp
	 * applications keep sending sockets disconnected.
	 * NOTE: our implementation doesn't notify connected sockets that has
	 * foreign address that is different than given destination addresses
	 * (this is permitted by RFC 3542).
	 */
	if ((inp->inp_flags & IN6P_MTU) == 0 || (
	    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr)))
		return;

	mtuctl.ip6m_mtu = mtu;
	mtuctl.ip6m_addr = *dst;
	/* Give up quietly if the scope cannot be recovered on the copy. */
	if (sa6_recoverscope(&mtuctl.ip6m_addr))
		return;

	if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl), IPV6_PATHMTU,
	    IPPROTO_IPV6, M_NOWAIT)) == NULL)
		return;

	so = inp->inp_socket;
	/* A zero return from sbappendaddr() is handled as an overflow. */
	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
	    == 0) {
		soroverflow(so);
		m_freem(m_mtu);
		/* XXX: should count statistics */
	} else
		sorwakeup(so);
}

/*
 * Get pointer to the previous header followed by the header
 * currently processed.
*/
/*
 * When off is exactly the size of the fixed IPv6 header, the offset of its
 * ip6_nxt field is returned.  Otherwise the extension headers are walked
 * from the end of the fixed header until the running offset reaches off,
 * and the offset at which the last walked header starts is returned.
 * Panics if off is smaller than the fixed header.
 */
int
ip6_get_prevhdr(const struct mbuf *m, int off)
{
	struct ip6_ext ext;
	struct ip6_hdr *hdr;
	int cur, step, proto;

	if (off == sizeof(struct ip6_hdr))
		return (offsetof(struct ip6_hdr, ip6_nxt));
	if (off < sizeof(struct ip6_hdr))
		panic("%s: off < sizeof(struct ip6_hdr)", __func__);

	hdr = mtod(m, struct ip6_hdr *);
	proto = hdr->ip6_nxt;
	step = 0;
	for (cur = sizeof(struct ip6_hdr); cur < off; cur += step) {
		m_copydata(m, cur, sizeof(ext), (caddr_t)&ext);
		/* Size of the header that starts at cur. */
		if (proto == IPPROTO_FRAGMENT)
			step = sizeof(struct ip6_frag);
		else if (proto == IPPROTO_AH)
			step = (ext.ip6e_len + 2) << 2;
		else
			step = (ext.ip6e_len + 1) << 3;
		proto = ext.ip6e_nxt;
	}
	/* cur is now past the last walked header; step back to its start. */
	return (cur - step);
}

/*
 * Get the offset of the header that follows the one of type proto located
 * at off.  m is only read.  If nxtp is not NULL it receives the protocol
 * number of that following header.  Returns -1 when the header cannot be
 * parsed: packet too short, no packet header on m, a non-first fragment,
 * or a protocol that is not walked here.
 */
int
ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp)
{
	struct ip6_hdr base;
	struct ip6_ext ext;
	struct ip6_frag frag;
	int following, advance;

	/* just in case */
	if (m == NULL)
		panic("ip6_nexthdr: m == NULL");
	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
		return (-1);

	switch (proto) {
	case IPPROTO_IPV6:
		if (m->m_pkthdr.len < off + sizeof(base))
			return (-1);
		m_copydata(m, off, sizeof(base), (caddr_t)&base);
		following = base.ip6_nxt;
		advance = sizeof(base);
		break;

	case IPPROTO_FRAGMENT:
		if (m->m_pkthdr.len < off + sizeof(frag))
			return (-1);
		m_copydata(m, off, sizeof(frag), (caddr_t)&frag);
		/*
		 * terminate parsing if it is not the first fragment,
		 * it does not make sense to parse through it.
		 */
		/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
		if (frag.ip6f_offlg & IP6F_OFF_MASK)
			return (-1);
		following = frag.ip6f_nxt;
		advance = sizeof(struct ip6_frag);
		break;

	case IPPROTO_AH:
	case IPPROTO_HOPOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_DSTOPTS:
		if (m->m_pkthdr.len < off + sizeof(ext))
			return (-1);
		m_copydata(m, off, sizeof(ext), (caddr_t)&ext);
		following = ext.ip6e_nxt;
		/* AH: (len + 2) * 4 bytes; the others: (len + 1) * 8 bytes. */
		if (proto == IPPROTO_AH)
			advance = (ext.ip6e_len + 2) << 2;
		else
			advance = (ext.ip6e_len + 1) << 3;
		break;

	default:
		/*
		 * IPPROTO_NONE, IPPROTO_ESP, IPPROTO_IPCOMP and everything
		 * else: give up.
		 */
		return (-1);
	}

	if (nxtp != NULL)
		*nxtp = following;
	return (off + advance);
}

/*
 * Get the offset of the last header in the chain that ip6_nexthdr() can
 * walk, starting from the header of type proto at off.  m is only read.
 * If nxtp is not NULL it is updated with the protocol number found at each
 * step.  Returns -1 if ip6_nexthdr() ever moves backwards.
 */
int
ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp)
{
	int scratch, next_off;

	/* ip6_nexthdr() needs somewhere to report the next protocol. */
	if (nxtp == NULL) {
		scratch = -1;
		nxtp = &scratch;
	}
	for (;;) {
		next_off = ip6_nexthdr(m, off, proto, nxtp);
		if (next_off < 0)
			return (off);		/* cannot go further */
		if (next_off < off)
			return (-1);		/* invalid */
		if (next_off == off)
			return (next_off);	/* no progress */

		off = next_off;
		proto = *nxtp;
	}
}

/*
 * System control for IP6
 */

/* One errno value (or 0) per PRC_* command index. */
u_char	inet6ctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		EHOSTUNREACH,	0,
	ENOPROTOOPT,	ECONNREFUSED
};
diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index 5d6e75abeda1..8c45431c83c3 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -1,322 +1,324 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 1999 Michael Smith
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EVENTHANDLER_H_ #define _SYS_EVENTHANDLER_H_ #include #include #include #include #include #ifdef VIMAGE struct eventhandler_entry_vimage { void (* func)(void); /* Original function registered. */ void *ee_arg; /* Original argument registered. */ void *sparep[2]; }; #endif struct eventhandler_list { char *el_name; int el_flags; /* Unused. */ u_int el_runcount; struct mtx el_lock; TAILQ_ENTRY(eventhandler_list) el_link; TAILQ_HEAD(,eventhandler_entry) el_entries; }; #define EHL_LOCK(p) mtx_lock(&(p)->el_lock) #define EHL_UNLOCK(p) mtx_unlock(&(p)->el_lock) #define EHL_LOCK_ASSERT(p, x) mtx_assert(&(p)->el_lock, x) /* * Macro to invoke the handlers for a given event. */ #define _EVENTHANDLER_INVOKE(name, list, ...) 
do { \ struct eventhandler_entry *_ep; \ struct eventhandler_entry_ ## name *_t; \ \ EHL_LOCK_ASSERT((list), MA_OWNED); \ (list)->el_runcount++; \ KASSERT((list)->el_runcount > 0, \ ("eventhandler_invoke: runcount overflow")); \ CTR0(KTR_EVH, "eventhandler_invoke(\"" __STRING(name) "\")"); \ TAILQ_FOREACH(_ep, &((list)->el_entries), ee_link) { \ if (_ep->ee_priority != EHE_DEAD_PRIORITY) { \ EHL_UNLOCK((list)); \ _t = (struct eventhandler_entry_ ## name *)_ep; \ CTR1(KTR_EVH, "eventhandler_invoke: executing %p", \ (void *)_t->eh_func); \ _t->eh_func(_ep->ee_arg , ## __VA_ARGS__); \ EHL_LOCK((list)); \ } \ } \ KASSERT((list)->el_runcount > 0, \ ("eventhandler_invoke: runcount underflow")); \ (list)->el_runcount--; \ if ((list)->el_runcount == 0) \ eventhandler_prune_list(list); \ EHL_UNLOCK((list)); \ } while (0) /* * You can optionally use the EVENTHANDLER_LIST and EVENTHANDLER_DIRECT macros * to pre-define a symbol for the eventhandler list. This symbol can be used by * EVENTHANDLER_DIRECT_INVOKE, which has the advantage of not needing to do a * locked search of the global list of eventhandler lists. At least * EVENTHANDLER_LIST_DEFINE must be used for EVENTHANDLER_DIRECT_INVOKE to * work. EVENTHANDLER_LIST_DECLARE is only needed if the call to * EVENTHANDLER_DIRECT_INVOKE is in a different compilation unit from * EVENTHANDLER_LIST_DEFINE. If the events are even relatively high frequency * it is suggested that you directly define a list for them. */ #define EVENTHANDLER_LIST_DEFINE(name) \ struct eventhandler_list *_eventhandler_list_ ## name ; \ static void _ehl_init_ ## name (void * ctx __unused) \ { \ _eventhandler_list_ ## name = eventhandler_create_list(#name); \ } \ SYSINIT(name ## _ehl_init, SI_SUB_EVENTHANDLER, SI_ORDER_ANY, \ _ehl_init_ ## name, NULL); \ struct __hack #define EVENTHANDLER_DIRECT_INVOKE(name, ...) 
do { \ struct eventhandler_list *_el; \ \ _el = _eventhandler_list_ ## name ; \ if (!TAILQ_EMPTY(&_el->el_entries)) { \ EHL_LOCK(_el); \ _EVENTHANDLER_INVOKE(name, _el , ## __VA_ARGS__); \ } \ } while (0) #define EVENTHANDLER_DEFINE(name, func, arg, priority) \ static eventhandler_tag name ## _tag; \ static void name ## _evh_init(void *ctx) \ { \ name ## _tag = EVENTHANDLER_REGISTER(name, func, ctx, \ priority); \ } \ SYSINIT(name ## _evh_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, \ name ## _evh_init, arg); \ struct __hack #define EVENTHANDLER_INVOKE(name, ...) \ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ _EVENTHANDLER_INVOKE(name, _el , ## __VA_ARGS__); \ } while (0) #define EVENTHANDLER_REGISTER(name, func, arg, priority) \ eventhandler_register(NULL, #name, func, arg, priority) #define EVENTHANDLER_DEREGISTER(name, tag) \ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ eventhandler_deregister(_el, tag); \ } while (0) #define EVENTHANDLER_DEREGISTER_NOWAIT(name, tag) \ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ eventhandler_deregister_nowait(_el, tag); \ } while (0) eventhandler_tag eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority); void eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag); void eventhandler_deregister_nowait(struct eventhandler_list *list, eventhandler_tag tag); struct eventhandler_list *eventhandler_find_list(const char *name); void eventhandler_prune_list(struct eventhandler_list *list); struct eventhandler_list *eventhandler_create_list(const char *name); #ifdef VIMAGE typedef void (*vimage_iterator_func_t)(void *, ...); eventhandler_tag vimage_eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority, vimage_iterator_func_t); #endif /* * Standard system event queues. 
*/ /* Generic priority levels */ #define EVENTHANDLER_PRI_FIRST 0 #define EVENTHANDLER_PRI_ANY 10000 #define EVENTHANDLER_PRI_LAST 20000 /* Shutdown events */ typedef void (*shutdown_fn)(void *, int); #define SHUTDOWN_PRI_FIRST EVENTHANDLER_PRI_FIRST #define SHUTDOWN_PRI_DEFAULT EVENTHANDLER_PRI_ANY #define SHUTDOWN_PRI_LAST EVENTHANDLER_PRI_LAST EVENTHANDLER_DECLARE(shutdown_pre_sync, shutdown_fn); /* before fs sync */ EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn); /* after fs sync */ EVENTHANDLER_DECLARE(shutdown_final, shutdown_fn); /* Power state change events */ typedef void (*power_change_fn)(void *); EVENTHANDLER_DECLARE(power_resume, power_change_fn); EVENTHANDLER_DECLARE(power_suspend, power_change_fn); EVENTHANDLER_DECLARE(power_suspend_early, power_change_fn); /* Low memory event */ typedef void (*vm_lowmem_handler_t)(void *, int); #define LOWMEM_PRI_DEFAULT EVENTHANDLER_PRI_FIRST EVENTHANDLER_DECLARE(vm_lowmem, vm_lowmem_handler_t); +/* Some of mbuf(9) zones reached maximum */ +EVENTHANDLER_DECLARE(mbuf_lowmem, vm_lowmem_handler_t); /* Root mounted event */ typedef void (*mountroot_handler_t)(void *); EVENTHANDLER_DECLARE(mountroot, mountroot_handler_t); /* File system mount events */ struct mount; struct vnode; struct thread; typedef void (*vfs_mounted_notify_fn)(void *, struct mount *, struct vnode *, struct thread *); typedef void (*vfs_unmounted_notify_fn)(void *, struct mount *, struct thread *); EVENTHANDLER_DECLARE(vfs_mounted, vfs_mounted_notify_fn); EVENTHANDLER_DECLARE(vfs_unmounted, vfs_unmounted_notify_fn); /* * Process events * process_fork and exit handlers are called without Giant. * exec handlers are called with Giant, but that is by accident. 
*/ struct proc; struct image_params; typedef void (*exitlist_fn)(void *, struct proc *); typedef void (*forklist_fn)(void *, struct proc *, struct proc *, int); typedef void (*execlist_fn)(void *, struct proc *, struct image_params *); typedef void (*proc_ctor_fn)(void *, struct proc *); typedef void (*proc_dtor_fn)(void *, struct proc *); typedef void (*proc_init_fn)(void *, struct proc *); typedef void (*proc_fini_fn)(void *, struct proc *); EVENTHANDLER_DECLARE(process_ctor, proc_ctor_fn); EVENTHANDLER_DECLARE(process_dtor, proc_dtor_fn); EVENTHANDLER_DECLARE(process_init, proc_init_fn); EVENTHANDLER_DECLARE(process_fini, proc_fini_fn); EVENTHANDLER_DECLARE(process_exit, exitlist_fn); EVENTHANDLER_DECLARE(process_fork, forklist_fn); EVENTHANDLER_DECLARE(process_exec, execlist_fn); /* * application dump event */ typedef void (*app_coredump_start_fn)(void *, struct thread *, char *name); typedef void (*app_coredump_progress_fn)(void *, struct thread *td, int byte_count); typedef void (*app_coredump_finish_fn)(void *, struct thread *td); typedef void (*app_coredump_error_fn)(void *, struct thread *td, char *msg, ...); EVENTHANDLER_DECLARE(app_coredump_start, app_coredump_start_fn); EVENTHANDLER_DECLARE(app_coredump_progress, app_coredump_progress_fn); EVENTHANDLER_DECLARE(app_coredump_finish, app_coredump_finish_fn); EVENTHANDLER_DECLARE(app_coredump_error, app_coredump_error_fn); typedef void (*thread_ctor_fn)(void *, struct thread *); typedef void (*thread_dtor_fn)(void *, struct thread *); typedef void (*thread_fini_fn)(void *, struct thread *); typedef void (*thread_init_fn)(void *, struct thread *); EVENTHANDLER_DECLARE(thread_ctor, thread_ctor_fn); EVENTHANDLER_DECLARE(thread_dtor, thread_dtor_fn); EVENTHANDLER_DECLARE(thread_init, thread_init_fn); EVENTHANDLER_DECLARE(thread_fini, thread_fini_fn); typedef void (*uma_zone_chfn)(void *); EVENTHANDLER_DECLARE(nmbclusters_change, uma_zone_chfn); EVENTHANDLER_DECLARE(nmbufs_change, uma_zone_chfn); 
EVENTHANDLER_DECLARE(maxsockets_change, uma_zone_chfn); /* Kernel linker file load and unload events */ struct linker_file; typedef void (*kld_load_fn)(void *, struct linker_file *); typedef void (*kld_unload_fn)(void *, const char *, caddr_t, size_t); typedef void (*kld_unload_try_fn)(void *, struct linker_file *, int *); EVENTHANDLER_DECLARE(kld_load, kld_load_fn); EVENTHANDLER_DECLARE(kld_unload, kld_unload_fn); EVENTHANDLER_DECLARE(kld_unload_try, kld_unload_try_fn); /* Generic graphics framebuffer interface */ struct fb_info; typedef void (*register_framebuffer_fn)(void *, struct fb_info *); typedef void (*unregister_framebuffer_fn)(void *, struct fb_info *); EVENTHANDLER_DECLARE(register_framebuffer, register_framebuffer_fn); EVENTHANDLER_DECLARE(unregister_framebuffer, unregister_framebuffer_fn); /* Veto ada attachment */ struct cam_path; struct ata_params; typedef void (*ada_probe_veto_fn)(void *, struct cam_path *, struct ata_params *, int *); EVENTHANDLER_DECLARE(ada_probe_veto, ada_probe_veto_fn); /* Swap device events */ struct swdevt; typedef void (*swapon_fn)(void *, struct swdevt *); typedef void (*swapoff_fn)(void *, struct swdevt *); EVENTHANDLER_DECLARE(swapon, swapon_fn); EVENTHANDLER_DECLARE(swapoff, swapoff_fn); /* newbus device events */ enum evhdev_detach { EVHDEV_DETACH_BEGIN, /* Before detach() is called */ EVHDEV_DETACH_COMPLETE, /* After detach() returns 0 */ EVHDEV_DETACH_FAILED /* After detach() returns err */ }; typedef void (*device_attach_fn)(void *, device_t); typedef void (*device_detach_fn)(void *, device_t, enum evhdev_detach); typedef void (*device_nomatch_fn)(void *, device_t); EVENTHANDLER_DECLARE(device_attach, device_attach_fn); EVENTHANDLER_DECLARE(device_detach, device_detach_fn); EVENTHANDLER_DECLARE(device_nomatch, device_nomatch_fn); /* Interface address addition and removal event */ struct ifaddr; typedef void (*rt_addrmsg_fn)(void *, struct ifaddr *, int); EVENTHANDLER_DECLARE(rt_addrmsg, rt_addrmsg_fn); #endif /* 
_SYS_EVENTHANDLER_H_ */ diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h index 6e46f40c8ad7..2fd7a0b30412 100644 --- a/sys/sys/protosw.h +++ b/sys/sys/protosw.h @@ -1,359 +1,354 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)protosw.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_PROTOSW_H_ #define _SYS_PROTOSW_H_ #include /* Forward declare these structures referenced from prototypes below. 
*/ struct kaiocb; struct mbuf; struct thread; struct sockaddr; struct socket; struct sockopt; /*#ifdef _KERNEL*/ /* * Protocol switch table. * * Each protocol has a handle initializing one of these structures, * which is used for protocol-protocol and system-protocol communication. * - * The system will call the pr_drain entry if it is low on space and - * this should throw away any non-critical data. - * * In retrospect, it would be a lot nicer to use an interface * similar to the vnode VOP interface. */ struct ifnet; struct stat; struct ucred; struct uio; /* USE THESE FOR YOUR PROTOTYPES ! */ typedef int pr_ctloutput_t(struct socket *, struct sockopt *); -typedef void pr_drain_t(void); typedef void pr_abort_t(struct socket *); typedef int pr_accept_t(struct socket *, struct sockaddr **); typedef int pr_attach_t(struct socket *, int, struct thread *); typedef int pr_bind_t(struct socket *, struct sockaddr *, struct thread *); typedef int pr_connect_t(struct socket *, struct sockaddr *, struct thread *); typedef int pr_connect2_t(struct socket *, struct socket *); typedef int pr_control_t(struct socket *, unsigned long, void *, struct ifnet *, struct thread *); typedef void pr_detach_t(struct socket *); typedef int pr_disconnect_t(struct socket *); typedef int pr_listen_t(struct socket *, int, struct thread *); typedef int pr_peeraddr_t(struct socket *, struct sockaddr **); typedef int pr_rcvd_t(struct socket *, int); typedef int pr_rcvoob_t(struct socket *, struct mbuf *, int); typedef enum { PRUS_OOB = 0x1, PRUS_EOF = 0x2, PRUS_MORETOCOME = 0x4, PRUS_NOTREADY = 0x8, PRUS_IPV6 = 0x10, } pr_send_flags_t; typedef int pr_send_t(struct socket *, int, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); typedef int pr_ready_t(struct socket *, struct mbuf *, int); typedef int pr_sense_t(struct socket *, struct stat *); typedef int pr_shutdown_t(struct socket *); typedef int pr_flush_t(struct socket *, int); typedef int pr_sockaddr_t(struct socket *, struct 
sockaddr **); typedef int pr_sosend_t(struct socket *, struct sockaddr *, struct uio *, struct mbuf *, struct mbuf *, int, struct thread *); typedef int pr_soreceive_t(struct socket *, struct sockaddr **, struct uio *, struct mbuf **, struct mbuf **, int *); typedef int pr_sopoll_t(struct socket *, int, struct ucred *, struct thread *); typedef void pr_sosetlabel_t(struct socket *); typedef void pr_close_t(struct socket *); typedef int pr_bindat_t(int, struct socket *, struct sockaddr *, struct thread *); typedef int pr_connectat_t(int, struct socket *, struct sockaddr *, struct thread *); typedef int pr_aio_queue_t(struct socket *, struct kaiocb *); struct protosw { short pr_type; /* socket type used for */ struct domain *pr_domain; /* domain protocol a member of */ short pr_protocol; /* protocol number */ short pr_flags; /* see below */ /* protocol-protocol hooks */ pr_ctloutput_t *pr_ctloutput; /* control output (from above) */ /* utility hooks */ - pr_drain_t *pr_drain; /* flush any excess space possible */ struct pr_usrreqs *pr_usrreqs; /* user-protocol hook */ }; /*#endif*/ /* * This number should be defined again within each protocol family to avoid * confusion. */ #define PROTO_SPACER 32767 /* spacer for loadable protocols */ /* * Values for pr_flags. * PR_ADDR requires PR_ATOMIC; * PR_ADDR and PR_CONNREQUIRED are mutually exclusive. * PR_IMPLOPCL means that the protocol allows sendto without prior connect, * and the protocol understands the MSG_EOF flag. The first property is * is only relevant if PR_CONNREQUIRED is set (otherwise sendto is allowed * anyhow). * PR_SOCKBUF requires protocol to initialize and destroy its socket buffers * in its pr_attach and pr_detach. 
*/ #define PR_ATOMIC 0x01 /* exchange atomic messages only */ #define PR_ADDR 0x02 /* addresses given with messages */ #define PR_CONNREQUIRED 0x04 /* connection required by protocol */ #define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */ #define PR_RIGHTS 0x10 /* passes capabilities */ #define PR_IMPLOPCL 0x20 /* implied open/close */ /* was PR_LASTHDR 0x40 enforce ipsec policy; last header */ #define PR_CAPATTACH 0x80 /* socket can attach in cap mode */ #define PR_SOCKBUF 0x100 /* private implementation of buffers */ /* * In earlier BSD network stacks, a single pr_usrreq() function pointer was * invoked with an operation number indicating what operation was desired. * We now provide individual function pointers which protocols can implement, * which offers a number of benefits (such as type checking for arguments). * These older constants are still present in order to support TCP debugging. */ #define PRU_ATTACH 0 /* attach protocol to up */ #define PRU_DETACH 1 /* detach protocol from up */ #define PRU_BIND 2 /* bind socket to address */ #define PRU_LISTEN 3 /* listen for connection */ #define PRU_CONNECT 4 /* establish connection to peer */ #define PRU_ACCEPT 5 /* accept connection from peer */ #define PRU_DISCONNECT 6 /* disconnect from peer */ #define PRU_SHUTDOWN 7 /* won't send any more data */ #define PRU_RCVD 8 /* have taken data; more room now */ #define PRU_SEND 9 /* send this data */ #define PRU_ABORT 10 /* abort (fast DISCONNECT, DETATCH) */ #define PRU_CONTROL 11 /* control operations on protocol */ #define PRU_SENSE 12 /* return status into m */ #define PRU_RCVOOB 13 /* retrieve out of band data */ #define PRU_SENDOOB 14 /* send out of band data */ #define PRU_SOCKADDR 15 /* fetch socket's address */ #define PRU_PEERADDR 16 /* fetch peer's address */ #define PRU_CONNECT2 17 /* connect two sockets */ /* begin for protocols internal use */ #define PRU_FASTTIMO 18 /* 200ms timeout */ #define PRU_SLOWTIMO 19 /* 500ms timeout */ #define PRU_PROTORCV 20 /* 
receive from below */ #define PRU_PROTOSEND 21 /* send to below */ /* end for protocol's internal use */ #define PRU_SEND_EOF 22 /* send and close */ #define PRU_SOSETLABEL 23 /* MAC label change */ #define PRU_CLOSE 24 /* socket close */ #define PRU_FLUSH 25 /* flush the socket */ #define PRU_NREQ 25 #ifdef PRUREQUESTS const char *prurequests[] = { "ATTACH", "DETACH", "BIND", "LISTEN", "CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN", "RCVD", "SEND", "ABORT", "CONTROL", "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", "PROTORCV", "PROTOSEND", "SEND_EOF", "SOSETLABEL", "CLOSE", "FLUSH", }; #endif #ifdef _KERNEL /* users shouldn't see this decl */ struct ifnet; struct stat; struct ucred; struct uio; /* * If the ordering here looks odd, that's because it's alphabetical. These * should eventually be merged back into struct protosw. * * Some fields initialized to defaults if they are NULL. */ struct pr_usrreqs { pr_abort_t *pru_abort; pr_accept_t *pru_accept; pr_attach_t *pru_attach; pr_bind_t *pru_bind; pr_connect_t *pru_connect; pr_connect2_t *pru_connect2; pr_control_t *pru_control; pr_detach_t *pru_detach; pr_disconnect_t *pru_disconnect; pr_listen_t *pru_listen; pr_peeraddr_t *pru_peeraddr; pr_rcvd_t *pru_rcvd; pr_rcvoob_t *pru_rcvoob; pr_send_t *pru_send; pr_ready_t *pru_ready; pr_sense_t *pru_sense; pr_shutdown_t *pru_shutdown; pr_flush_t *pru_flush; pr_sockaddr_t *pru_sockaddr; pr_sosend_t *pru_sosend; pr_soreceive_t *pru_soreceive; pr_sopoll_t *pru_sopoll; pr_sosetlabel_t *pru_sosetlabel; pr_close_t *pru_close; pr_bindat_t *pru_bindat; pr_connectat_t *pru_connectat; pr_aio_queue_t *pru_aio_queue; }; /* * All nonvoid pru_*() functions below return EOPNOTSUPP. 
*/ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam); int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job); int pru_attach_notsupp(struct socket *so, int proto, struct thread *td); int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td); int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connect2_notsupp(struct socket *so1, struct socket *so2); int pru_control_notsupp(struct socket *so, u_long cmd, void *data, struct ifnet *ifp, struct thread *td); int pru_disconnect_notsupp(struct socket *so); int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td); int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam); int pru_rcvd_notsupp(struct socket *so, int flags); int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags); int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td); int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count); int pru_sense_null(struct socket *so, struct stat *sb); int pru_shutdown_notsupp(struct socket *so); int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam); int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td); #endif /* _KERNEL */ /* * The arguments to the ctlinput routine are * (*protosw[].pr_ctlinput)(cmd, sa, arg); * where cmd is one of the commands below, sa is a pointer to a sockaddr, * and arg 
is a `void *' argument used within a protocol family. */ #define PRC_ROUTEDEAD 1 /* select new route if possible ??? */ /* was PRC_QUENCH2 3 DEC congestion bit says slow down */ /* was PRC_QUENCH 4 Deprecated by RFC 6633 */ #define PRC_MSGSIZE 5 /* message size forced drop */ #define PRC_HOSTDEAD 6 /* host appears to be down */ #define PRC_HOSTUNREACH 7 /* deprecated (use PRC_UNREACH_HOST) */ #define PRC_UNREACH_NET 8 /* no route to network */ #define PRC_UNREACH_HOST 9 /* no route to host */ #define PRC_UNREACH_PROTOCOL 10 /* dst says bad protocol */ #define PRC_UNREACH_PORT 11 /* bad port # */ /* was PRC_UNREACH_NEEDFRAG 12 (use PRC_MSGSIZE) */ #define PRC_UNREACH_SRCFAIL 13 /* source route failed */ #define PRC_REDIRECT_NET 14 /* net routing redirect */ #define PRC_REDIRECT_HOST 15 /* host routing redirect */ #define PRC_REDIRECT_TOSNET 16 /* redirect for type of service & net */ #define PRC_REDIRECT_TOSHOST 17 /* redirect for tos & host */ #define PRC_TIMXCEED_INTRANS 18 /* packet lifetime expired in transit */ #define PRC_TIMXCEED_REASS 19 /* lifetime expired on reass q */ #define PRC_PARAMPROB 20 /* header incorrect */ #define PRC_UNREACH_ADMIN_PROHIB 21 /* packet administrativly prohibited */ #define PRC_NCMDS 22 #define PRC_IS_REDIRECT(cmd) \ ((cmd) >= PRC_REDIRECT_NET && (cmd) <= PRC_REDIRECT_TOSHOST) #ifdef PRCREQUESTS char *prcrequests[] = { "IFDOWN", "ROUTEDEAD", "IFUP", "DEC-BIT-QUENCH2", "QUENCH", "MSGSIZE", "HOSTDEAD", "#7", "NET-UNREACH", "HOST-UNREACH", "PROTO-UNREACH", "PORT-UNREACH", "#12", "SRCFAIL-UNREACH", "NET-REDIRECT", "HOST-REDIRECT", "TOSNET-REDIRECT", "TOSHOST-REDIRECT", "TX-INTRANS", "TX-REASS", "PARAMPROB", "ADMIN-UNREACH" }; #endif /* * The arguments to ctloutput are: * (*protosw[].pr_ctloutput)(req, so, level, optname, optval, p); * req is one of the actions listed below, so is a (struct socket *), * level is an indication of which protocol layer the option is intended. 
* optname is a protocol dependent socket option request, * optval is a pointer to a mbuf-chain pointer, for value-return results. * The protocol is responsible for disposal of the mbuf chain *optval * if supplied, * the caller is responsible for any space held by *optval, when returned. * A non-zero return from ctloutput gives an * UNIX error number which should be passed to higher level software. */ #define PRCO_GETOPT 0 #define PRCO_SETOPT 1 #define PRCO_NCMDS 2 #ifdef PRCOREQUESTS char *prcorequests[] = { "GETOPT", "SETOPT", }; #endif #ifdef _KERNEL struct domain *pffinddomain(int family); struct protosw *pffindproto(int family, int protocol, int type); struct protosw *pffindtype(int family, int type); int pf_proto_register(int family, struct protosw *npr); int pf_proto_unregister(int family, int protocol, int type); #endif #endif diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index 82ba3c81ef1b..63d31dc1d135 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -1,113 +1,114 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ #ifdef _KERNEL /* * Header file for pageout daemon. */ /* * Exported data structures. 
*/ extern u_long vm_page_max_user_wired; extern int vm_pageout_page_count; #define VM_OOM_MEM 1 #define VM_OOM_MEM_PF 2 #define VM_OOM_SWAPZ 3 /* * vm_lowmem flags. */ #define VM_LOW_KMEM 0x01 #define VM_LOW_PAGES 0x02 +#define VM_LOW_MBUFS 0x04 /* * Exported routines. */ /* * Signal pageout-daemon and wait for it. */ void vm_wait(vm_object_t obj); int vm_wait_intr(vm_object_t obj); void vm_waitpfault(struct domainset *, int timo); void vm_wait_domain(int domain); void vm_wait_min(void); void vm_wait_severe(void); int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *); void vm_pageout_oom(int shortage); void vm_swapout_run(void); void vm_swapout_run_idle(void); #endif /* _KERNEL */ #endif /* _VM_VM_PAGEOUT_H_ */