diff --git a/sys/compat/linuxkpi/common/src/linux_shrinker.c b/sys/compat/linuxkpi/common/src/linux_shrinker.c index 18200fa0bc01..e06490b92ed1 100644 --- a/sys/compat/linuxkpi/common/src/linux_shrinker.c +++ b/sys/compat/linuxkpi/common/src/linux_shrinker.c @@ -1,156 +1,156 @@ /*- * Copyright (c) 2020 Emmanuel Vadot * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include TAILQ_HEAD(, shrinker) lkpi_shrinkers = TAILQ_HEAD_INITIALIZER(lkpi_shrinkers); static struct sx sx_shrinker; struct shrinker * linuxkpi_shrinker_alloc(unsigned int flags, const char *fmt, ...) { struct shrinker *shrinker; shrinker = kzalloc(sizeof(*shrinker), GFP_KERNEL); if (shrinker == NULL) return (NULL); shrinker->flags = flags | SHRINKER_ALLOCATED; shrinker->seeks = DEFAULT_SEEKS; return (shrinker); } int linuxkpi_register_shrinker(struct shrinker *s) { KASSERT(s != NULL, ("NULL shrinker")); KASSERT(s->count_objects != NULL, ("NULL shrinker")); KASSERT(s->scan_objects != NULL, ("NULL shrinker")); sx_xlock(&sx_shrinker); s->flags |= SHRINKER_REGISTERED; TAILQ_INSERT_TAIL(&lkpi_shrinkers, s, next); sx_xunlock(&sx_shrinker); return (0); } void linuxkpi_unregister_shrinker(struct shrinker *s) { sx_xlock(&sx_shrinker); TAILQ_REMOVE(&lkpi_shrinkers, s, next); s->flags &= ~SHRINKER_REGISTERED; sx_xunlock(&sx_shrinker); } void linuxkpi_shrinker_free(struct shrinker *shrinker) { if (shrinker->flags & SHRINKER_REGISTERED) unregister_shrinker(shrinker); kfree(shrinker); } void linuxkpi_synchronize_shrinkers(void) { sx_xlock(&sx_shrinker); sx_xunlock(&sx_shrinker); } #define SHRINKER_BATCH 512 static void shrinker_shrink(struct shrinker *s) { struct shrink_control sc; unsigned long can_free; unsigned long batch; unsigned long scanned = 0; unsigned long ret; can_free = s->count_objects(s, &sc); if (can_free <= 0) return; batch = s->batch ? s->batch : SHRINKER_BATCH; while (scanned <= can_free) { sc.nr_to_scan = batch; ret = s->scan_objects(s, &sc); if (ret == SHRINK_STOP) break; scanned += batch; } } static void -linuxkpi_vm_lowmem(void *arg __unused) +linuxkpi_vm_lowmem(void *arg __unused, int flags __unused) { struct shrinker *s; sx_xlock(&sx_shrinker); TAILQ_FOREACH(s, &lkpi_shrinkers, next) { shrinker_shrink(s); } sx_xunlock(&sx_shrinker); } static eventhandler_tag lowmem_tag; static void linuxkpi_sysinit_shrinker(void *arg __unused) { sx_init(&sx_shrinker, "lkpi-shrinker"); lowmem_tag = EVENTHANDLER_REGISTER(vm_lowmem, linuxkpi_vm_lowmem, NULL, EVENTHANDLER_PRI_FIRST); } static void linuxkpi_sysuninit_shrinker(void *arg __unused) { sx_destroy(&sx_shrinker); EVENTHANDLER_DEREGISTER(vm_lowmem, lowmem_tag); } SYSINIT(linuxkpi_shrinker, SI_SUB_DRIVERS, SI_ORDER_ANY, linuxkpi_sysinit_shrinker, NULL); SYSUNINIT(linuxkpi_shrinker, SI_SUB_DRIVERS, SI_ORDER_ANY, linuxkpi_sysuninit_shrinker, NULL); diff --git a/sys/dev/drm2/ttm/ttm_page_alloc.c b/sys/dev/drm2/ttm/ttm_page_alloc.c index 7518ecb4dfd1..724ba0bfb4d8 100644 --- a/sys/dev/drm2/ttm/ttm_page_alloc.c +++ b/sys/dev/drm2/ttm/ttm_page_alloc.c @@ -1,922 +1,922 @@ /* * Copyright (c) Red Hat Inc. * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Authors: Dave Airlie * Jerome Glisse * Pauli Nieminen */ /* * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. */ /* simple list based uncached page pool * - Pool collects resently freed pages for reuse * - Use page->lru to keep a free list * - doesn't track currently in use pages */ #include #include #include #include #include #include #define NUM_PAGES_TO_ALLOC (PAGE_SIZE/sizeof(vm_page_t)) #define SMALL_ALLOCATION 16 #define FREE_ALL_PAGES (~0U) /* times are in msecs */ #define PAGE_FREE_INTERVAL 1000 /** * struct ttm_page_pool - Pool to reuse recently allocated uc/wc pages. * * @lock: Protects the shared pool from concurrnet access. Must be used with * irqsave/irqrestore variants because pool allocator maybe called from * delayed work. * @fill_lock: Prevent concurrent calls to fill. * @list: Pool of free uc/wc pages for fast reuse. * @gfp_flags: Flags to pass for alloc_page. * @npages: Number of pages in pool. */ struct ttm_page_pool { struct mtx lock; bool fill_lock; bool dma32; struct pglist list; int ttm_page_alloc_flags; unsigned npages; char *name; unsigned long nfrees; unsigned long nrefills; }; /** * Limits for the pool. They are handled without locks because only place where * they may change is in sysfs store. They won't have immediate effect anyway * so forcing serialization to access them is pointless. */ struct ttm_pool_opts { unsigned alloc_size; unsigned max_size; unsigned small; }; #define NUM_POOLS 4 /** * struct ttm_pool_manager - Holds memory pools for fst allocation * * Manager is read only object for pool code so it doesn't need locking. * * @free_interval: minimum number of jiffies between freeing pages from pool. * @page_alloc_inited: reference counting for pool allocation. * @work: Work that is used to shrink the pool. Work is only run when there is * some pages to free. * @small_allocation: Limit in number of pages what is small allocation. * * @pools: All pool objects in use. **/ struct ttm_pool_manager { unsigned int kobj_ref; eventhandler_tag lowmem_handler; struct ttm_pool_opts options; union { struct ttm_page_pool u_pools[NUM_POOLS]; struct _utag { struct ttm_page_pool u_wc_pool; struct ttm_page_pool u_uc_pool; struct ttm_page_pool u_wc_pool_dma32; struct ttm_page_pool u_uc_pool_dma32; } _ut; } _u; }; #define pools _u.u_pools #define wc_pool _u._ut.u_wc_pool #define uc_pool _u._ut.u_uc_pool #define wc_pool_dma32 _u._ut.u_wc_pool_dma32 #define uc_pool_dma32 _u._ut.u_uc_pool_dma32 MALLOC_DEFINE(M_TTM_POOLMGR, "ttm_poolmgr", "TTM Pool Manager"); static void ttm_vm_page_free(vm_page_t m) { KASSERT(m->object == NULL, ("ttm page %p is owned", m)); KASSERT(vm_page_wired(m), ("ttm lost wire %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("ttm lost fictitious %p", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("ttm got unmanaged %p", m)); m->flags &= ~PG_FICTITIOUS; m->oflags |= VPO_UNMANAGED; vm_page_unwire_noq(m); vm_page_free(m); } static vm_memattr_t ttm_caching_state_to_vm(enum ttm_caching_state cstate) { switch (cstate) { case tt_uncached: return (VM_MEMATTR_UNCACHEABLE); case tt_wc: return (VM_MEMATTR_WRITE_COMBINING); case tt_cached: return (VM_MEMATTR_WRITE_BACK); } panic("caching state %d\n", cstate); } static vm_page_t ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr) { vm_page_t p; int err, tries; for (tries = 0; ; tries++) { p = vm_page_alloc_noobj_contig(req, 1, 0, 0xffffffff, PAGE_SIZE, 0, memattr); if (p != NULL || tries > 2) return (p); err = vm_page_reclaim_contig(req, 1, 0, 0xffffffff, PAGE_SIZE, 0); if (err == ENOMEM) vm_wait(NULL); else if (err != 0) return (NULL); } } static vm_page_t ttm_vm_page_alloc_any(int req, vm_memattr_t memattr) { vm_page_t p; p = vm_page_alloc_noobj(req | VM_ALLOC_WAITOK); pmap_page_set_memattr(p, memattr); return (p); } static vm_page_t ttm_vm_page_alloc(int flags, enum ttm_caching_state cstate) { vm_page_t p; vm_memattr_t memattr; int req; memattr = ttm_caching_state_to_vm(cstate); req = VM_ALLOC_WIRED; if ((flags & TTM_PAGE_FLAG_ZERO_ALLOC) != 0) req |= VM_ALLOC_ZERO; if ((flags & TTM_PAGE_FLAG_DMA32) != 0) p = ttm_vm_page_alloc_dma32(req, memattr); else p = ttm_vm_page_alloc_any(req, memattr); if (p != NULL) { p->oflags &= ~VPO_UNMANAGED; p->flags |= PG_FICTITIOUS; } return (p); } static void ttm_pool_kobj_release(struct ttm_pool_manager *m) { free(m, M_TTM_POOLMGR); } #if 0 /* XXXKIB sysctl */ static ssize_t ttm_pool_store(struct ttm_pool_manager *m, struct attribute *attr, const char *buffer, size_t size) { int chars; unsigned val; chars = sscanf(buffer, "%u", &val); if (chars == 0) return size; /* Convert kb to number of pages */ val = val / (PAGE_SIZE >> 10); if (attr == &ttm_page_pool_max) m->options.max_size = val; else if (attr == &ttm_page_pool_small) m->options.small = val; else if (attr == &ttm_page_pool_alloc_size) { if (val > NUM_PAGES_TO_ALLOC*8) { pr_err("Setting allocation size to %lu is not allowed. Recommended size is %lu\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 7), NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); return size; } else if (val > NUM_PAGES_TO_ALLOC) { pr_warn("Setting allocation size to larger than %lu is not recommended\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); } m->options.alloc_size = val; } return size; } static ssize_t ttm_pool_show(struct ttm_pool_manager *m, struct attribute *attr, char *buffer) { unsigned val = 0; if (attr == &ttm_page_pool_max) val = m->options.max_size; else if (attr == &ttm_page_pool_small) val = m->options.small; else if (attr == &ttm_page_pool_alloc_size) val = m->options.alloc_size; val = val * (PAGE_SIZE >> 10); return snprintf(buffer, PAGE_SIZE, "%u\n", val); } #endif static struct ttm_pool_manager *_manager; static int set_pages_array_wb(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_BACK); #endif return 0; } static int set_pages_array_wc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_COMBINING); #endif return 0; } static int set_pages_array_uc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_UNCACHEABLE); #endif return 0; } /** * Select the right pool or requested caching state and ttm flags. */ static struct ttm_page_pool *ttm_get_pool(int flags, enum ttm_caching_state cstate) { int pool_index; if (cstate == tt_cached) return NULL; if (cstate == tt_wc) pool_index = 0x0; else pool_index = 0x1; if (flags & TTM_PAGE_FLAG_DMA32) pool_index |= 0x2; return &_manager->pools[pool_index]; } /* set memory back to wb and free the pages. */ static void ttm_pages_put(vm_page_t *pages, unsigned npages) { unsigned i; /* Our VM handles vm memattr automatically on the page free. */ if (set_pages_array_wb(pages, npages)) printf("[TTM] Failed to set %d pages to wb!\n", npages); for (i = 0; i < npages; ++i) ttm_vm_page_free(pages[i]); } static void ttm_pool_update_free_locked(struct ttm_page_pool *pool, unsigned freed_pages) { pool->npages -= freed_pages; pool->nfrees += freed_pages; } /** * Free pages from pool. * * To prevent hogging the ttm_swap process we only free NUM_PAGES_TO_ALLOC * number of pages in one go. * * @pool: to free the pages from * @free_all: If set to true will free all pages in pool **/ static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free) { vm_page_t p, p1; vm_page_t *pages_to_free; unsigned freed_pages = 0, npages_to_free = nr_free; unsigned i; if (NUM_PAGES_TO_ALLOC < nr_free) npages_to_free = NUM_PAGES_TO_ALLOC; pages_to_free = malloc(npages_to_free * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); restart: mtx_lock(&pool->lock); TAILQ_FOREACH_REVERSE_SAFE(p, &pool->list, pglist, plinks.q, p1) { if (freed_pages >= npages_to_free) break; pages_to_free[freed_pages++] = p; /* We can only remove NUM_PAGES_TO_ALLOC at a time. */ if (freed_pages >= NUM_PAGES_TO_ALLOC) { /* remove range of pages from the pool */ for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); /** * Because changing page caching is costly * we unlock the pool to prevent stalling. */ mtx_unlock(&pool->lock); ttm_pages_put(pages_to_free, freed_pages); if (likely(nr_free != FREE_ALL_PAGES)) nr_free -= freed_pages; if (NUM_PAGES_TO_ALLOC >= nr_free) npages_to_free = nr_free; else npages_to_free = NUM_PAGES_TO_ALLOC; freed_pages = 0; /* free all so restart the processing */ if (nr_free) goto restart; /* Not allowed to fall through or break because * following context is inside spinlock while we are * outside here. */ goto out; } } /* remove range of pages from the pool */ if (freed_pages) { for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); nr_free -= freed_pages; } mtx_unlock(&pool->lock); if (freed_pages) ttm_pages_put(pages_to_free, freed_pages); out: free(pages_to_free, M_TEMP); return nr_free; } /* Get good estimation how many pages are free in pools */ static int ttm_pool_get_num_unused_pages(void) { unsigned i; int total = 0; for (i = 0; i < NUM_POOLS; ++i) total += _manager->pools[i].npages; return total; } /** * Callback for mm to request pool to reduce number of page held. */ -static int ttm_pool_mm_shrink(void *arg) +static int ttm_pool_mm_shrink(void *arg, int flags __unused) { static unsigned int start_pool = 0; unsigned i; unsigned pool_offset = atomic_fetchadd_int(&start_pool, 1); struct ttm_page_pool *pool; int shrink_pages = 100; /* XXXKIB */ pool_offset = pool_offset % NUM_POOLS; /* select start pool in round robin fashion */ for (i = 0; i < NUM_POOLS; ++i) { unsigned nr_free = shrink_pages; if (shrink_pages == 0) break; pool = &_manager->pools[(i + pool_offset)%NUM_POOLS]; shrink_pages = ttm_page_pool_free(pool, nr_free); } /* return estimated number of unused pages in pool */ return ttm_pool_get_num_unused_pages(); } static void ttm_pool_mm_shrink_init(struct ttm_pool_manager *manager) { manager->lowmem_handler = EVENTHANDLER_REGISTER(vm_lowmem, ttm_pool_mm_shrink, manager, EVENTHANDLER_PRI_ANY); } static void ttm_pool_mm_shrink_fini(struct ttm_pool_manager *manager) { EVENTHANDLER_DEREGISTER(vm_lowmem, manager->lowmem_handler); } static int ttm_set_pages_caching(vm_page_t *pages, enum ttm_caching_state cstate, unsigned cpages) { int r = 0; /* Set page caching */ switch (cstate) { case tt_uncached: r = set_pages_array_uc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to uc!\n", cpages); break; case tt_wc: r = set_pages_array_wc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to wc!\n", cpages); break; default: break; } return r; } /** * Free pages the pages that failed to change the caching state. If there is * any pages that have changed their caching state already put them to the * pool. */ static void ttm_handle_caching_state_failure(struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate, vm_page_t *failed_pages, unsigned cpages) { unsigned i; /* Failed pages have to be freed */ for (i = 0; i < cpages; ++i) { TAILQ_REMOVE(pages, failed_pages[i], plinks.q); ttm_vm_page_free(failed_pages[i]); } } /** * Allocate new pages with correct caching. * * This function is reentrant if caller updates count depending on number of * pages returned in pages array. */ static int ttm_alloc_new_pages(struct pglist *pages, int ttm_alloc_flags, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t *caching_array; vm_page_t p; int r = 0; unsigned i, cpages; unsigned max_cpages = min(count, (unsigned)(PAGE_SIZE/sizeof(vm_page_t))); /* allocate array for page caching change */ caching_array = malloc(max_cpages * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); for (i = 0, cpages = 0; i < count; ++i) { p = ttm_vm_page_alloc(ttm_alloc_flags, cstate); if (!p) { printf("[TTM] Unable to get page %u\n", i); /* store already allocated pages in the pool after * setting the caching state */ if (cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } r = -ENOMEM; goto out; } #ifdef CONFIG_HIGHMEM /* KIB: nop */ /* gfp flags of highmem page should never be dma32 so we * we should be fine in such case */ if (!PageHighMem(p)) #endif { caching_array[cpages++] = p; if (cpages == max_cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) { ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); goto out; } cpages = 0; } } TAILQ_INSERT_HEAD(pages, p, plinks.q); } if (cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } out: free(caching_array, M_TEMP); return r; } /** * Fill the given pool if there aren't enough pages and the requested number of * pages is small. */ static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t p; int r; unsigned cpages = 0; /** * Only allow one pool fill operation at a time. * If pool doesn't have enough pages for the allocation new pages are * allocated from outside of pool. */ if (pool->fill_lock) return; pool->fill_lock = true; /* If allocation request is small and there are not enough * pages in a pool we fill the pool up first. */ if (count < _manager->options.small && count > pool->npages) { struct pglist new_pages; unsigned alloc_size = _manager->options.alloc_size; /** * Can't change page caching if in irqsave context. We have to * drop the pool->lock. */ mtx_unlock(&pool->lock); TAILQ_INIT(&new_pages); r = ttm_alloc_new_pages(&new_pages, pool->ttm_page_alloc_flags, ttm_flags, cstate, alloc_size); mtx_lock(&pool->lock); if (!r) { TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); ++pool->nrefills; pool->npages += alloc_size; } else { printf("[TTM] Failed to fill pool (%p)\n", pool); /* If we have any pages left put them to the pool. */ TAILQ_FOREACH(p, &pool->list, plinks.q) { ++cpages; } TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); pool->npages += cpages; } } pool->fill_lock = false; } /** * Cut 'count' number of pages from the pool and put them on the return list. * * @return count of pages still required to fulfill the request. */ static unsigned ttm_page_pool_get_pages(struct ttm_page_pool *pool, struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t p; unsigned i; mtx_lock(&pool->lock); ttm_page_pool_fill_locked(pool, ttm_flags, cstate, count); if (count >= pool->npages) { /* take all pages from the pool */ TAILQ_CONCAT(pages, &pool->list, plinks.q); count -= pool->npages; pool->npages = 0; goto out; } for (i = 0; i < count; i++) { p = TAILQ_FIRST(&pool->list); TAILQ_REMOVE(&pool->list, p, plinks.q); TAILQ_INSERT_TAIL(pages, p, plinks.q); } pool->npages -= count; count = 0; out: mtx_unlock(&pool->lock); return count; } /* Put all pages in pages list to correct pool to wait for reuse */ static void ttm_put_pages(vm_page_t *pages, unsigned npages, int flags, enum ttm_caching_state cstate) { struct ttm_page_pool *pool = ttm_get_pool(flags, cstate); unsigned i; if (pool == NULL) { /* No pool for this memory type so free the pages */ for (i = 0; i < npages; i++) { if (pages[i]) { ttm_vm_page_free(pages[i]); pages[i] = NULL; } } return; } mtx_lock(&pool->lock); for (i = 0; i < npages; i++) { if (pages[i]) { TAILQ_INSERT_TAIL(&pool->list, pages[i], plinks.q); pages[i] = NULL; pool->npages++; } } /* Check that we don't go over the pool limit */ npages = 0; if (pool->npages > _manager->options.max_size) { npages = pool->npages - _manager->options.max_size; /* free at least NUM_PAGES_TO_ALLOC number of pages * to reduce calls to set_memory_wb */ if (npages < NUM_PAGES_TO_ALLOC) npages = NUM_PAGES_TO_ALLOC; } mtx_unlock(&pool->lock); if (npages) ttm_page_pool_free(pool, npages); } /* * On success pages list will hold count number of correctly * cached pages. */ static int ttm_get_pages(vm_page_t *pages, unsigned npages, int flags, enum ttm_caching_state cstate) { struct ttm_page_pool *pool = ttm_get_pool(flags, cstate); struct pglist plist; vm_page_t p = NULL; int gfp_flags; unsigned count; int r; /* No pool for cached pages */ if (pool == NULL) { for (r = 0; r < npages; ++r) { p = ttm_vm_page_alloc(flags, cstate); if (!p) { printf("[TTM] Unable to allocate page\n"); return -ENOMEM; } pages[r] = p; } return 0; } /* combine zero flag to pool flags */ gfp_flags = flags | pool->ttm_page_alloc_flags; /* First we take pages from the pool */ TAILQ_INIT(&plist); npages = ttm_page_pool_get_pages(pool, &plist, flags, cstate, npages); count = 0; TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } /* clear the pages coming from the pool if requested */ if (flags & TTM_PAGE_FLAG_ZERO_ALLOC) { TAILQ_FOREACH(p, &plist, plinks.q) { pmap_zero_page(p); } } /* If pool didn't have enough pages allocate new one. */ if (npages > 0) { /* ttm_alloc_new_pages doesn't reference pool so we can run * multiple requests in parallel. **/ TAILQ_INIT(&plist); r = ttm_alloc_new_pages(&plist, gfp_flags, flags, cstate, npages); TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } if (r) { /* If there is any pages in the list put them back to * the pool. */ printf("[TTM] Failed to allocate extra pages for large request\n"); ttm_put_pages(pages, count, flags, cstate); return r; } } return 0; } static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, int flags, char *name) { mtx_init(&pool->lock, "ttmpool", NULL, MTX_DEF); pool->fill_lock = false; TAILQ_INIT(&pool->list); pool->npages = pool->nfrees = 0; pool->ttm_page_alloc_flags = flags; pool->name = name; } int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages) { if (_manager != NULL) printf("[TTM] manager != NULL\n"); printf("[TTM] Initializing pool allocator\n"); _manager = malloc(sizeof(*_manager), M_TTM_POOLMGR, M_WAITOK | M_ZERO); ttm_page_pool_init_locked(&_manager->wc_pool, 0, "wc"); ttm_page_pool_init_locked(&_manager->uc_pool, 0, "uc"); ttm_page_pool_init_locked(&_manager->wc_pool_dma32, TTM_PAGE_FLAG_DMA32, "wc dma"); ttm_page_pool_init_locked(&_manager->uc_pool_dma32, TTM_PAGE_FLAG_DMA32, "uc dma"); _manager->options.max_size = max_pages; _manager->options.small = SMALL_ALLOCATION; _manager->options.alloc_size = NUM_PAGES_TO_ALLOC; refcount_init(&_manager->kobj_ref, 1); ttm_pool_mm_shrink_init(_manager); return 0; } void ttm_page_alloc_fini(void) { int i; printf("[TTM] Finalizing pool allocator\n"); ttm_pool_mm_shrink_fini(_manager); for (i = 0; i < NUM_POOLS; ++i) ttm_page_pool_free(&_manager->pools[i], FREE_ALL_PAGES); if (refcount_release(&_manager->kobj_ref)) ttm_pool_kobj_release(_manager); _manager = NULL; } int ttm_pool_populate(struct ttm_tt *ttm) { struct ttm_mem_global *mem_glob = ttm->glob->mem_glob; unsigned i; int ret; if (ttm->state != tt_unpopulated) return 0; for (i = 0; i < ttm->num_pages; ++i) { ret = ttm_get_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); if (ret != 0) { ttm_pool_unpopulate(ttm); return -ENOMEM; } ret = ttm_mem_global_alloc_page(mem_glob, ttm->pages[i], false, false); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return -ENOMEM; } } if (unlikely(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)) { ret = ttm_tt_swapin(ttm); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return ret; } } ttm->state = tt_unbound; return 0; } void ttm_pool_unpopulate(struct ttm_tt *ttm) { unsigned i; for (i = 0; i < ttm->num_pages; ++i) { if (ttm->pages[i]) { ttm_mem_global_free_page(ttm->glob->mem_glob, ttm->pages[i]); ttm_put_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); } } ttm->state = tt_unpopulated; } #if 0 /* XXXKIB sysctl */ int ttm_page_alloc_debugfs(struct seq_file *m, void *data) { struct ttm_page_pool *p; unsigned i; char *h[] = {"pool", "refills", "pages freed", "size"}; if (!_manager) { seq_printf(m, "No pool allocator running.\n"); return 0; } seq_printf(m, "%6s %12s %13s %8s\n", h[0], h[1], h[2], h[3]); for (i = 0; i < NUM_POOLS; ++i) { p = &_manager->pools[i]; seq_printf(m, "%6s %12ld %13ld %8d\n", p->name, p->nrefills, p->nfrees, p->npages); } return 0; } #endif diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 51bbdeaaa55e..f6d0201f9b37 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,2105 +1,2105 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "Location of process' ps_strings structure"); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "Top of process stack"); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", "Stack memory permissions"); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, "Process' command line characters cache limit"); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int core_dump_can_intr = 1; SYSCTL_INT(_kern, OID_AUTO, core_dump_can_intr, CTLFLAG_RWTUN, &core_dump_can_intr, 0, "Core dumping interruptible with SIGKILL"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t ps_strings; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &val, sizeof(val))); } #endif ps_strings = PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings))); } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t val; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val32; val32 = round_page((unsigned int)p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val32, sizeof(val32))); } #endif val = round_page(p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val, sizeof(val))); } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL, oldvmspace); } post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } exec_cleanup(td, oldvmspace); } /* * kern_execve() has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { TSEXEC(td->td_proc->p_pid, args->begin_argv); AUDIT_ARG_ARGV(args->begin_argv, args->argc, exec_args_get_begin_envv(args) - args->begin_argv); AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc, args->endp - exec_args_get_begin_envv(args)); #ifdef KTRACE if (KTRPOINT(td, KTR_ARGS)) { ktrdata(KTR_ARGS, args->begin_argv, exec_args_get_begin_envv(args) - args->begin_argv); } if (KTRPOINT(td, KTR_ENVS)) { ktrdata(KTR_ENVS, exec_args_get_begin_envv(args), args->endp - exec_args_get_begin_envv(args)); } #endif /* Must have at least one argument. */ if (args->argc == 0) { exec_free_args(args); return (EINVAL); } return (do_execve(td, args, mac_p, oldvmspace)); } static void execve_nosetid(struct image_params *imgp) { imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; uintptr_t stack_base; struct image_params image_params, *imgp; struct vattr attr; struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct ktr_io_params *kiop; #endif struct vnode *oldtextvp, *newtextvp; struct vnode *oldtextdvp, *newtextdvp; char *oldbinname, *newbinname; bool credential_changing; #ifdef MAC struct label *interpvplabel = NULL; bool will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif int error, i, orig_osrel; uint32_t orig_fctl0; Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; oldtextvp = oldtextdvp = NULL; newtextvp = newtextdvp = NULL; newbinname = oldbinname = NULL; #ifdef KTRACE kiop = NULL; #endif /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; orig_osrel = p->p_osrel; orig_fctl0 = p->p_fctl0; orig_brandinfo = p->p_elf_brandinfo; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_NAMEI, args->fname); /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif /* * Translate the file name. namei() returns a vnode * pointer in ni_vp among other things. */ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | AUDITVNODE1 | WANTPARENT, UIO_SYSSPACE, args->fname); error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; newtextdvp = nd.ni_dvp; nd.ni_dvp = NULL; newbinname = malloc(nd.ni_cnd.cn_namelen + 1, M_PARGS, M_WAITOK); memcpy(newbinname, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen); newbinname[nd.ni_cnd.cn_namelen] = '\0'; imgp->vp = newtextvp; /* * Do the best to calculate the full path to the image file. */ if (args->fname[0] == '/') { imgp->execpath = args->fname; } else { VOP_UNLOCK(imgp->vp); freepath_size = MAXPATHLEN; if (vn_fullpath_hardlink(newtextvp, newtextdvp, newbinname, nd.ni_cnd.cn_namelen, &imgp->execpath, &imgp->freepath, &freepath_size) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } } else if (imgp->interpreter_vp) { /* * An image activator has already provided an open vnode */ newtextvp = imgp->interpreter_vp; imgp->interpreter_vp = NULL; if (vn_fullpath(newtextvp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * If the descriptors was not opened with O_PATH, then * we require that it was opened with O_EXEC or * O_RDONLY. In either case, exec_check_permissions() * below checks _current_ file access mode regardless * of the permissions additionally checked at the * open(2). */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error != 0) goto exec_fail; if (vn_fullpath(newtextvp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions. Also 'opens' file and sets its vnode to * text mode. */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; imgp->proc->p_fctl0 = 0; imgp->proc->p_elf_brandinfo = NULL; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = false; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp) != 0; credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ error = -1; for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL) continue; error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * The text reference needs to be removed for scripts. * There is a short period before we determine that * something is a script where text reference is active. * The vnode lock is held over this entire period * so nothing should illegitimately be blocked. */ MPASS(imgp->textset); VOP_UNSET_TEXT_CHECKED(newtextvp); imgp->textset = false; /* free name buffer and old vnode */ #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = false; } vput(newtextvp); imgp->vp = newtextvp = NULL; if (args->fname != NULL) { if (newtextdvp != NULL) { vrele(newtextdvp); newtextdvp = NULL; } NDFREE_PNBUF(&nd); free(newbinname, M_PARGS); newbinname = NULL; } vm_object_deallocate(imgp->object); imgp->object = NULL; execve_nosetid(imgp); imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ if (imgp->interpreter_vp) { args->fname = NULL; } else { args->fname = imgp->interpreter_name; } goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base. */ error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Stack setup. */ error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * For security and other reasons, the file descriptor table cannot be * shared after an exec. */ fdunshare(td); pdunshare(td); /* close files on exec */ fdcloseexec(td); /* * Malloc things before we need locks. */ i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) p->p_flag2 &= ~P2_STKGAP_DISABLE; p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE | P2_MEMBAR_GLOBE); if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } if ((imgp->sysent->sv_setid_allowed != NULL && !(*imgp->sysent->sv_setid_allowed)(td, imgp)) || (p->p_flag2 & P2_NO_NEW_PRIVS) != 0) execve_nosetid(imgp); /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE kiop = ktrprocexec(p); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in kern.proc.pathname. This vnode was * referenced by namei() or by fexecve variant of fname handling. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; oldtextdvp = p->p_textdvp; p->p_textdvp = newtextdvp; newtextdvp = NULL; oldbinname = p->p_binname; p->p_binname = newbinname; newbinname = NULL; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp); pe.pm_credentialschanged = credential_changing; pe.pm_baseaddr = imgp->reloc_base; pe.pm_dynaddr = imgp->et_dyn_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, stack_base); VOP_MMAPPED(imgp->vp); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (error != 0) { p->p_osrel = orig_osrel; p->p_fctl0 = orig_fctl0; p->p_elf_brandinfo = orig_brandinfo; } if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (imgp->textset) VOP_UNSET_TEXT_CHECKED(imgp->vp); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp); if (args->fname != NULL) NDFREE_PNBUF(&nd); if (newtextdvp != NULL) vrele(newtextdvp); free(newbinname, M_PARGS); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); if (oldtextdvp != NULL) vrele(oldtextdvp); free(oldbinname, M_PARGS); #ifdef KTRACE ktr_io_params_free(kiop); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exec_cleanup(td, oldvmspace); exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); } void exec_cleanup(struct thread *td, struct vmspace *oldvmspace) { if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(td->td_proc->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } int exec_map_first_page(struct image_params *imgp) { vm_object_t object; vm_page_t m; int error; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif error = vm_page_grab_valid_unlocked(&m, object, 0, VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) | VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (error != VM_PAGER_OK) return (EIO); imgp->firstpage = sf_buf_alloc(m, 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_unwire(m, PQ_ACTIVE); } } void exec_onexec_old(struct thread *td) { sigfastblock_clear(td); umtx_exec(td->td_proc); } /* * This is an optimization which removes the unmanaged shared page * mapping. In combination with pmap_remove_pages(), which cleans all * managed mappings in the process' vmspace pmap, no work will be left * for pmap_remove(min, max). */ void exec_free_abi_mappings(struct proc *p) { struct vmspace *vmspace; vmspace = p->p_vmspace; if (refcount_load(&vmspace->vm_refcnt) != 1) return; if (!PROC_HAS_SHP(p)) return; pmap_remove(vmspace_pmap(vmspace), vmspace->vm_shp_base, vmspace->vm_shp_base + p->p_sysent->sv_shared_page_len); } /* * Run down the current address space and install a new one. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; struct thread *td = curthread; vm_offset_t sv_minuser; vm_map_t map; imgp->vmspace_destroyed = true; imgp->sysent = sv; if (p->p_sysent->sv_onexec_old != NULL) p->p_sysent->sv_onexec_old(td); itimers_exec(p); EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (refcount_load(&vmspace->vm_refcnt) == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { exec_free_abi_mappings(p); shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE). * ASLR and W^X states must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART | MAP_ASLR_STACK | MAP_WXORX); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0); } /* * Compute the stack size limit and map the main process stack. * Map the shared page. */ int exec_map_stack(struct image_params *imgp) { struct rlimit rlim_stack; struct sysentvec *sv; struct proc *p; vm_map_t map; struct vmspace *vmspace; vm_offset_t stack_addr, stack_top; vm_offset_t sharedpage_addr; u_long ssiz; int error, find_space, stack_off; vm_prot_t stack_prot; vm_object_t obj; p = imgp->proc; sv = p->p_sysent; if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } vmspace = p->p_vmspace; map = &vmspace->vm_map; stack_prot = sv->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot; if ((map->flags & MAP_ASLR_STACK) != 0) { stack_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); find_space = VMFS_ANY_SPACE; } else { stack_addr = sv->sv_usrstack - ssiz; find_space = VMFS_NO_SPACE; } error = vm_map_find(map, NULL, 0, &stack_addr, (vm_size_t)ssiz, sv->sv_usrstack, find_space, stack_prot, VM_PROT_ALL, MAP_STACK_AREA); if (error != KERN_SUCCESS) { uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x " "failed, mach error %d errno %d\n", (uintmax_t)ssiz, stack_prot, error, vm_mmap_to_errno(error)); return (vm_mmap_to_errno(error)); } stack_top = stack_addr + ssiz; if ((map->flags & MAP_ASLR_STACK) != 0) { /* Randomize within the first page of the stack. */ arc4rand(&stack_off, sizeof(stack_off), 0); stack_top -= rounddown2(stack_off & PAGE_MASK, sizeof(void *)); } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj == NULL) { sharedpage_addr = 0; goto out; } /* * If randomization is disabled then the shared page will * be mapped at address specified in sysentvec. * Otherwise any address above .data section can be selected. * Same logic is used for stack address randomization. * If the address randomization is applied map a guard page * at the top of UVA. */ vm_object_reference(obj); if ((imgp->imgp_flags & IMGP_ASLR_SHARED_PAGE) != 0) { sharedpage_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); error = vm_map_fixed(map, NULL, 0, sv->sv_maxuser - PAGE_SIZE, PAGE_SIZE, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD); if (error != KERN_SUCCESS) { /* * This is not fatal, so let's just print a warning * and continue. */ uprintf("%s: Mapping guard page at the top of UVA failed" " mach error %d errno %d", __func__, error, vm_mmap_to_errno(error)); } error = vm_map_find(map, obj, 0, &sharedpage_addr, sv->sv_shared_page_len, sv->sv_maxuser, VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } if (error != KERN_SUCCESS) { uprintf("%s: mapping shared page at addr: %p" "failed, mach error %d errno %d\n", __func__, (void *)sharedpage_addr, error, vm_mmap_to_errno(error)); vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } out: /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_maxsaddr = (char *)stack_addr; vmspace->vm_stacktop = stack_top; vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_shp_base = sharedpage_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, const char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long arg, env; int error; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ error = exec_args_add_fname(args, fname, segflg); if (error != 0) goto err_exit; /* * extract arguments first */ for (;;) { error = fueword(argv++, &arg); if (error == -1) { error = EFAULT; goto err_exit; } if (arg == 0) break; error = exec_args_add_arg(args, (char *)(uintptr_t)arg, UIO_USERSPACE); if (error != 0) goto err_exit; } /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &env); if (error == -1) { error = EFAULT; goto err_exit; } if (env == 0) break; error = exec_args_add_env(args, (char *)(uintptr_t)env, UIO_USERSPACE); if (error != 0) goto err_exit; } } return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } kasan_mark((void *)argkva->addr, exec_map_entry_size, exec_map_entry_size, 0); *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; kasan_mark((void *)argkva->addr, 0, exec_map_entry_size, KASAN_EXEC_ARGS_FREED); if (argkva->gen != gen) { (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void -exec_args_kva_lowmem(void *arg __unused) +exec_args_kva_lowmem(void *arg __unused, int flags __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } } /* * A set to functions to fill struct image args. * * NOTE: exec_args_add_fname() must be called (possibly with a NULL * fname) before the other functions. All exec_args_add_arg() calls must * be made before any exec_args_add_env() calls. exec_args_adjust_args() * may be called any time after exec_args_add_fname(). * * exec_args_add_fname() - install path to be executed * exec_args_add_arg() - append an argument string * exec_args_add_env() - append an env string * exec_args_adjust_args() - adjust location of the argument list to * allow new arguments to be prepended */ int exec_args_add_fname(struct image_args *args, const char *fname, enum uio_seg segflg) { int error; size_t length; KASSERT(args->fname == NULL, ("fname already appended")); KASSERT(args->endp == NULL, ("already appending to args")); if (fname != NULL) { args->fname = args->buf; error = segflg == UIO_SYSSPACE ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); } else length = 0; /* Set up for _arg_*()/_env_*() */ args->endp = args->buf + length; /* begin_argv must be set and kept updated */ args->begin_argv = args->endp; KASSERT(exec_map_entry_size - length >= ARG_MAX, ("too little space remaining for arguments %zu < %zu", exec_map_entry_size - length, (size_t)ARG_MAX)); args->stringspace = ARG_MAX; return (0); } static int exec_args_add_str(struct image_args *args, const char *str, enum uio_seg segflg, int *countp) { int error; size_t length; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); error = (segflg == UIO_SYSSPACE) ? copystr(str, args->endp, args->stringspace, &length) : copyinstr(str, args->endp, args->stringspace, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); args->stringspace -= length; args->endp += length; (*countp)++; return (0); } int exec_args_add_arg(struct image_args *args, const char *argp, enum uio_seg segflg) { KASSERT(args->envc == 0, ("appending args after env")); return (exec_args_add_str(args, argp, segflg, &args->argc)); } int exec_args_add_env(struct image_args *args, const char *envp, enum uio_seg segflg) { if (args->envc == 0) args->begin_envv = args->endp; return (exec_args_add_str(args, envp, segflg, &args->envc)); } int exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend) { ssize_t offset; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); offset = extend - consume; if (args->stringspace < offset) return (E2BIG); memmove(args->begin_argv + extend, args->begin_argv + consume, args->endp - args->begin_argv + consume); if (args->envc > 0) args->begin_envv += offset; args->endp += offset; args->stringspace -= offset; return (0); } char * exec_args_get_begin_envv(struct image_args *args) { KASSERT(args->endp != NULL, ("endp not initialized")); if (args->envc > 0) return (args->begin_envv); return (args->endp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ int exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; struct proc *p; struct sysentvec *sysent; size_t execpath_len; int error, szsigcode; char canary[sizeof(long) * 8]; p = imgp->proc; sysent = p->p_sysent; destp = PROC_PS_STRINGS(p); arginfo = imgp->ps_strings = (void *)destp; /* * Install sigcode. */ if (sysent->sv_shared_page_base == 0 && sysent->sv_szsigcode != NULL) { szsigcode = *(sysent->sv_szsigcode); destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); error = copyout(sysent->sv_sigcode, (void *)destp, szsigcode); if (error != 0) return (error); } /* * Copy the image path for the rtld. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ imgp->pagesizeslen = sizeof(pagesizes[0]) * MAXPAGESIZES; destp -= imgp->pagesizeslen; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = (void *)destp; error = copyout(pagesizes, imgp->pagesizes, imgp->pagesizeslen); if (error != 0) return (error); /* * Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ destp -= AT_COUNT * sizeof(Elf_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* * Fill in "ps_strings" struct for ps, w, etc. */ imgp->argv = vectp; if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* a null vector table pointer separates the argp's from the envp's */ if (suword(vectp++, 0) != 0) return (EFAULT); imgp->envv = vectp; if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* end of vector table is a null pointer */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. * * Add a text reference now so no one can write to the * executable while we're activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ error = VOP_SET_TEXT(vp); if (error != 0) return (error); imgp->textset = true; /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = true; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *cp, char *base, char *buf, size_t len) { size_t chunk_len; int error; error = 0; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); /* * We can get EFAULT error here. * In that case zero out the current chunk of the segment. */ error = copyin(base, buf, chunk_len); if (error != 0) bzero(buf, chunk_len); error = compressor_write(cp->comp, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } int core_write(struct coredump_params *cp, const void *base, size_t len, off_t offset, enum uio_seg seg, size_t *resid) { return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base), len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, cp->active_cred, cp->file_cred, resid, cp->td)); } int core_output(char *base, size_t len, off_t offset, struct coredump_params *cp, void *tmpbuf) { vm_map_t map; struct mount *mp; size_t resid, runlen; int error; bool success; KASSERT((uintptr_t)base % PAGE_SIZE == 0, ("%s: user address %p is not page-aligned", __func__, base)); if (cp->comp != NULL) return (compress_chunk(cp, base, tmpbuf, len)); error = 0; map = &cp->td->td_proc->p_vmspace->vm_map; for (; len > 0; base += runlen, offset += runlen, len -= runlen) { /* * Attempt to page in all virtual pages in the range. If a * virtual page is not backed by the pager, it is represented as * a hole in the file. This can occur with zero-filled * anonymous memory or truncated files, for example. */ for (runlen = 0; runlen < len; runlen += PAGE_SIZE) { if (core_dump_can_intr && curproc_sigkilled()) return (EINTR); error = vm_fault(map, (uintptr_t)base + runlen, VM_PROT_READ, VM_FAULT_NOFILL, NULL); if (runlen == 0) success = error == KERN_SUCCESS; else if ((error == KERN_SUCCESS) != success) break; } if (success) { error = core_write(cp, base, runlen, offset, UIO_USERSPACE, &resid); if (error != 0) { if (error != EFAULT) break; /* * EFAULT may be returned if the user mapping * could not be accessed, e.g., because a mapped * file has been truncated. Skip the page if no * progress was made, to protect against a * hypothetical scenario where vm_fault() was * successful but core_write() returns EFAULT * anyway. */ runlen -= resid; if (runlen == 0) { success = false; runlen = PAGE_SIZE; } } } if (!success) { error = vn_start_write(cp->vp, &mp, V_WAIT); if (error != 0) break; vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_truncate_locked(cp->vp, offset + runlen, false, cp->td->td_ucred); VOP_UNLOCK(cp->vp); vn_finished_write(mp); if (error != 0) break; } } return (error); } /* * Drain into a core file. */ int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *cp; struct proc *p; int error, locked; cp = arg; p = cp->td->td_proc; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p); if (locked) PROC_UNLOCK(p); if (cp->comp != NULL) error = compressor_write(cp->comp, __DECONST(char *, data), len); else error = core_write(cp, __DECONST(void *, data), len, cp->offset, UIO_SYSSPACE, NULL); if (locked) PROC_LOCK(p); if (error != 0) return (-error); cp->offset += len; return (len); } diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c index a95780aa2f27..177069f5e010 100644 --- a/sys/netinet/ip_reass.c +++ b/sys/netinet/ip_reass.c @@ -1,979 +1,984 @@ /*- * Copyright (c) 2015 Gleb Smirnoff * Copyright (c) 2015 Adrian Chadd * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC #include #endif SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ #define IPREASS_NHASH_LOG2 10 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (V_ipq_hashsize - 1) struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; struct callout timer; #ifdef VIMAGE struct vnet *vnet; #endif int count; }; VNET_DEFINE_STATIC(struct ipqbucket *, ipq); #define V_ipq VNET(ipq) VNET_DEFINE_STATIC(uint32_t, ipq_hashseed); #define V_ipq_hashseed VNET(ipq_hashseed) VNET_DEFINE_STATIC(uint32_t, ipq_hashsize); #define V_ipq_hashsize VNET(ipq_hashsize) #define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) #define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock) #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) #define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) #define IPQ_BUCKET_LOCK_ASSERT(b) mtx_assert(&(b)->lock, MA_OWNED) VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); #define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) void ipreass_init(void); void ipreass_vnet_init(void); #ifdef VIMAGE void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static int sysctl_fragttl(SYSCTL_HANDLER_ARGS); static void ipreass_zone_change(void *); static void ipreass_drain_tomax(void); static void ipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static void ipreass_callout(void *); static void ipreass_reschedule(struct ipqbucket *); static inline void ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); ipq_free(bucket, fp); } static inline void ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ipq_free(bucket, fp); ipreass_reschedule(bucket); } /* * By default, limit the number of IP fragments across all reassembly * queues to 1/32 of the total number of mbuf clusters. * * Limit the total number of reassembly queues per VNET to the * IP fragment limit, but ensure the limit will not allow any bucket * to grow above 100 items. (The bucket limit is * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct * multiplier to reach a 100-item limit.) * The 100-item limit was chosen as brief testing seems to show that * this produces "reasonable" performance on some subset of systems * under DoS attack. */ #define IP_MAXFRAGS (nmbclusters / 32) #define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, V_ipq_hashsize * 50)) static int maxfrags; static u_int __exclusive_cache_line nfrags; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, &maxfrags, 0, "Maximum number of IPv4 fragments allowed across all reassembly queues"); SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, &nfrags, 0, "Current number of IPv4 fragments across all reassembly queues"); VNET_DEFINE_STATIC(uma_zone_t, ipq_zone); #define V_ipq_zone VNET(ipq_zone) SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0, "Size of IP fragment reassembly hashtable"); SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_maxfragpackets, "I", "Maximum number of IPv4 fragment reassembly queue entries"); SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, &VNET_NAME(ipq_zone), "Current number of IPv4 fragment reassembly queue entries"); VNET_DEFINE_STATIC(int, noreass); #define V_noreass VNET(noreass) VNET_DEFINE_STATIC(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxfragbucketsize, "I", "Maximum number of IPv4 fragment reassembly queue entries per bucket"); VNET_DEFINE_STATIC(u_int, ipfragttl) = 30; #define V_ipfragttl VNET(ipfragttl) SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU", "IP fragment life time on reassembly queue (seconds)"); /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ #define M_IP_FRAG M_PROTO9 struct mbuf * ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp; struct ifnet *srcifp; struct ipqhead *head; int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. * Also, drop packet if it would exceed the maximum * number of fragments. */ tmpmax = maxfrags; if (V_noreass == 1 || V_maxfragsperpacket == 0 || (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len = htons(ntohs(ip->ip_len) - hlen); /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes, unless * this is the last fragment. */ if (ip->ip_len == htons(0) || ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) { IPSTAT_INC(ips_toosmall); /* XXX */ IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } if (ip->ip_off & htons(IP_MF)) m->m_flags |= M_IP_FRAG; else m->m_flags &= ~M_IP_FRAG; ip->ip_off = htons(ntohs(ip->ip_off) << 3); /* * Make sure the fragment lies within a packet of valid size. */ if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } /* * Store receive network interface pointer for later. */ srcifp = m->m_pkthdr.rcvif; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ IPSTAT_INC(ips_fragments); m->m_pkthdr.PH_loc.ptr = ip; /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; hashkey[0] = ip->ip_src.s_addr; hashkey[1] = ip->ip_dst.s_addr; hashkey[2] = (uint32_t)ip->ip_p << 16; hashkey[2] += ip->ip_id; hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); hash &= IPREASS_HMASK; head = &V_ipq[hash].head; IPQ_LOCK(hash); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) break; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { if (V_ipq[hash].count < V_ipreass_maxbucketsize) fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); V_ipq[hash].count++; fp->ipq_nfrags = 1; atomic_add_int(&nfrags, 1); fp->ipq_expire = time_uptime + V_ipfragttl; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; if (m->m_flags & M_IP_FRAG) fp->ipq_maxoff = -1; else fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); m->m_nextpkt = NULL; if (fp == TAILQ_LAST(head, ipqhead)) callout_reset_sbt(&V_ipq[hash].timer, SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout, &V_ipq[hash], 0); else MPASS(callout_active(&V_ipq[hash].timer)); goto done; } else { /* * If we already saw the last fragment, make sure * this fragment's offset looks sane. Otherwise, if * this is the last fragment, record its endpoint. */ if (fp->ipq_maxoff > 0) { i = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) || ((m->m_flags & M_IP_FRAG) == 0 && i != fp->ipq_maxoff)) { fp = NULL; goto dropfrag; } } else if ((m->m_flags & M_IP_FRAG) == 0) fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); fp->ipq_nfrags++; atomic_add_int(&nfrags, 1); #ifdef MAC mac_ipq_update(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the preceding * segment, then it's checksum is invalidated. */ if (p) { i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) - ntohs(ip->ip_off); if (i > 0) { if (i >= ntohs(ip->ip_len)) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) > ntohs(GETIP(q)->ip_off); q = nq) { i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) - ntohs(GETIP(q)->ip_off); if (i < ntohs(GETIP(q)->ip_len)) { GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i); GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i); m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; atomic_subtract_int(&nfrags, 1); m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) * */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) ipq_drop(&V_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) ipq_drop(&V_ipq[hash], fp); goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); ipq_drop(&V_ipq[hash], fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_demote_pkthdr(q); m_cat(m, q); } /* * In order to do checksumming faster we do 'end-around carry' here * (and not in for{} loop), though it implies we are not going to * reassemble more than 64k fragments. */ while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); atomic_subtract_int(&nfrags, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); V_ipq[hash].count--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ m_fixhdr(m); /* set valid receive interface pointer */ m->m_pkthdr.rcvif = srcifp; } IPSTAT_INC(ips_reassembled); ipreass_reschedule(&V_ipq[hash]); IPQ_UNLOCK(hash); #ifdef RSS /* * Query the RSS layer for the flowid / flowtype for the * mbuf payload. * * For now, just assume we have to calculate a new one. * Later on we should check to see if the assigned flowid matches * what RSS wants for the given IP protocol and if so, just keep it. * * We then queue into the relevant netisr so it can be dispatched * to the correct CPU. * * Note - this may return 1, which means the flowid in the mbuf * is correct for the configured RSS hash types and can be used. */ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { m->m_pkthdr.flowid = rss_hash; M_HASHTYPE_SET(m, rss_type); } /* * Queue/dispatch for reprocessing. * * Note: this is much slower than just handling the frame in the * current receive context. It's likely worth investigating * why this is. */ netisr_dispatch(NETISR_IP_DIRECT, m); return (NULL); #endif /* Handle in-line */ return (m); dropfrag: IPSTAT_INC(ips_fragdropped); if (fp != NULL) { fp->ipq_nfrags--; atomic_subtract_int(&nfrags, 1); } m_freem(m); done: IPQ_UNLOCK(hash); return (NULL); #undef GETIP } /* * Timer expired on a bucket. * There should be at least one ipq to be timed out. */ static void ipreass_callout(void *arg) { struct ipqbucket *bucket = arg; struct ipq *fp; IPQ_BUCKET_LOCK_ASSERT(bucket); MPASS(atomic_load_int(&nfrags) > 0); CURVNET_SET(bucket->vnet); fp = TAILQ_LAST(&bucket->head, ipqhead); KASSERT(fp != NULL && fp->ipq_expire <= time_uptime, ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket, fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime)); while (fp != NULL && fp->ipq_expire <= time_uptime) { ipq_timeout(bucket, fp); fp = TAILQ_LAST(&bucket->head, ipqhead); } ipreass_reschedule(bucket); CURVNET_RESTORE(); } static void ipreass_reschedule(struct ipqbucket *bucket) { struct ipq *fp; IPQ_BUCKET_LOCK_ASSERT(bucket); if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) { time_t t; /* Protect against time_uptime tick. */ t = fp->ipq_expire - time_uptime; t = (t > 0) ? t : 1; callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S, ipreass_callout, bucket, 0); } else callout_stop(&bucket->timer); } static void ipreass_drain_vnet(void) { u_int dropped = 0; for (int i = 0; i < V_ipq_hashsize; i++) { bool resched; IPQ_LOCK(i); resched = !TAILQ_EMPTY(&V_ipq[i].head); while(!TAILQ_EMPTY(&V_ipq[i].head)) { struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head); dropped += fp->ipq_nfrags; ipq_free(&V_ipq[i], fp); } if (resched) ipreass_reschedule(&V_ipq[i]); KASSERT(V_ipq[i].count == 0, ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, V_ipq[i].count, V_ipq)); IPQ_UNLOCK(i); } IPSTAT_ADD(ips_fragdropped, dropped); } /* * Drain off all datagram fragments. */ static void ipreass_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } +static void +ipreass_drain_lowmem(void *arg __unused, int flags __unused) +{ + ipreass_drain(); +} /* * Initialize IP reassembly structures. */ MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers"); void ipreass_vnet_init(void) { int max; V_ipq_hashsize = IPREASS_NHASH; TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize); V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize, M_IPREASS_HASH, M_WAITOK); for (int i = 0; i < V_ipq_hashsize; i++) { TAILQ_INIT(&V_ipq[i].head); mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW); callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0); V_ipq[i].count = 0; #ifdef VIMAGE V_ipq[i].vnet = curvnet; #endif } V_ipq_hashseed = arc4random(); V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); max = IP_MAXFRAGPACKETS; max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); } void ipreass_init(void) { maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); - EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL, + EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain_lowmem, NULL, + LOWMEM_PRI_DEFAULT); + EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain_lowmem, NULL, LOWMEM_PRI_DEFAULT); - EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL, - LOWMEM_PRI_DEFAULT); } /* * Drain off all datagram fragments belonging to * the given network interface. */ static void ipreass_cleanup(void *arg __unused, struct ifnet *ifp) { struct ipq *fp, *temp; struct mbuf *m; int i; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); CURVNET_SET_QUIET(ifp->if_vnet); /* * Skip processing if IPv4 reassembly is not initialised or * torn down by ipreass_destroy(). */ if (V_ipq_zone == NULL) { CURVNET_RESTORE(); return; } for (i = 0; i < V_ipq_hashsize; i++) { IPQ_LOCK(i); /* Scan fragment list. */ TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) { for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) { /* clear no longer valid rcvif pointer */ if (m->m_pkthdr.rcvif == ifp) m->m_pkthdr.rcvif = NULL; } } IPQ_UNLOCK(i); } CURVNET_RESTORE(); } EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0); #ifdef VIMAGE /* * Destroy IP reassembly structures. */ void ipreass_destroy(void) { ipreass_drain_vnet(); uma_zdestroy(V_ipq_zone); V_ipq_zone = NULL; for (int i = 0; i < V_ipq_hashsize; i++) mtx_destroy(&V_ipq[i].lock); free(V_ipq, M_IPREASS_HASH); } #endif /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void ipreass_drain_tomax(void) { struct ipq *fp; int target; /* * Make sure each bucket is under the new limit. If * necessary, drop enough of the oldest elements from * each bucket to get under the new limit. */ for (int i = 0; i < V_ipq_hashsize; i++) { IPQ_LOCK(i); while (V_ipq[i].count > V_ipreass_maxbucketsize && (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) ipq_timeout(&V_ipq[i], fp); ipreass_reschedule(&V_ipq[i]); IPQ_UNLOCK(i); } /* * If we are over the maximum number of fragments, * drain off enough to get down to the new limit, * stripping off last elements on queues. Every * run we strip the oldest element from each bucket. */ target = uma_zone_get_max(V_ipq_zone); while (uma_zone_get_cur(V_ipq_zone) > target) { for (int i = 0; i < V_ipq_hashsize; i++) { IPQ_LOCK(i); fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); if (fp != NULL) { ipq_timeout(&V_ipq[i], fp); ipreass_reschedule(&V_ipq[i]); } IPQ_UNLOCK(i); } } } static void ipreass_zone_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); int max; maxfrags = IP_MAXFRAGS; max = IP_MAXFRAGPACKETS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); ipreass_drain_tomax(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Change the limit on the UMA zone, or disable the fragment allocation * at all. Since 0 and -1 is a special values here, we need our own handler, * instead of sysctl_handle_uma_zone_max(). */ static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) { int error, max; if (V_noreass == 0) { max = uma_zone_get_max(V_ipq_zone); if (max == 0) max = -1; } else max = 0; error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); if (max > 0) { /* * XXXRW: Might be a good idea to sanity check the argument * and place an extreme upper bound. */ max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); ipreass_drain_tomax(); V_noreass = 0; } else if (max == 0) { V_noreass = 1; ipreass_drain(); } else if (max == -1) { V_noreass = 0; uma_zone_set_max(V_ipq_zone, 0); V_ipreass_maxbucketsize = INT_MAX; } else return (EINVAL); return (0); } /* * Seek for old fragment queue header that can be reused. Try to * reuse a header from currently locked hash bucket. */ static struct ipq * ipq_reuse(int start) { struct ipq *fp; int bucket, i; IPQ_LOCK_ASSERT(start); for (i = 0; i < V_ipq_hashsize; i++) { bucket = (start + i) % V_ipq_hashsize; if (bucket != start && IPQ_TRYLOCK(bucket) == 0) continue; fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); if (fp) { struct mbuf *m; IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { m = fp->ipq_frags; fp->ipq_frags = m->m_nextpkt; m_freem(m); } TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); V_ipq[bucket].count--; ipreass_reschedule(&V_ipq[bucket]); if (bucket != start) IPQ_UNLOCK(bucket); break; } if (bucket != start) IPQ_UNLOCK(bucket); } IPQ_LOCK_ASSERT(start); return (fp); } /* * Free a fragment reassembly header and all associated datagrams. */ static void ipq_free(struct ipqbucket *bucket, struct ipq *fp) { struct mbuf *q; atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(&bucket->head, fp, ipq_list); bucket->count--; uma_zfree(V_ipq_zone, fp); } /* * Get or set the maximum number of reassembly queues per bucket. */ static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) { int error, max; max = V_ipreass_maxbucketsize; error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); if (max <= 0) return (EINVAL); V_ipreass_maxbucketsize = max; ipreass_drain_tomax(); return (0); } /* * Get or set the IP fragment time to live. */ static int sysctl_fragttl(SYSCTL_HANDLER_ARGS) { u_int ttl; int error; ttl = V_ipfragttl; error = sysctl_handle_int(oidp, &ttl, 0, req); if (error || !req->newptr) return (error); if (ttl < 1 || ttl > MAXTTL) return (EINVAL); atomic_store_int(&V_ipfragttl, ttl); return (0); } diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index 185530e3bc5e..55b452743f86 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -1,6973 +1,6973 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET6 #include #endif #include #include #include /* FIX: we don't handle multiple link local scopes */ /* "scopeless" replacement IN6_ARE_ADDR_EQUAL */ #ifdef INET6 int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b) { struct sockaddr_in6 tmp_a, tmp_b; memcpy(&tmp_a, a, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } memcpy(&tmp_b, b, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr)); } #endif void sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb) { /* * We really don't need to lock this, but I will just because it * does not hurt. */ SCTP_INP_INFO_RLOCK(); spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep); spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc); spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr); spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr); spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk); spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq); spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq); spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks); SCTP_INP_INFO_RUNLOCK(); } /*- * Addresses are added to VRF's (Virtual Router's). For BSD we * have only the default VRF 0. We maintain a hash list of * VRF's. Each VRF has its own list of sctp_ifn's. Each of * these has a list of addresses. When we add a new address * to a VRF we lookup the ifn/ifn_index, if the ifn does * not exist we create it and add it to the list of IFN's * within the VRF. Once we have the sctp_ifn, we add the * address to the list. So we look something like: * * hash-vrf-table * vrf-> ifn-> ifn -> ifn * vrf | * ... +--ifa-> ifa -> ifa * vrf * * We keep these separate lists since the SCTP subsystem will * point to these from its source address selection nets structure. * When an address is deleted it does not happen right away on * the SCTP side, it gets scheduled. What we do when a * delete happens is immediately remove the address from * the master list and decrement the refcount. As our * addip iterator works through and frees the src address * selection pointing to the sctp_ifa, eventually the refcount * will reach 0 and we will delete it. Note that it is assumed * that any locking on system level ifn/ifa is done at the * caller of these functions and these routines will only * lock the SCTP structures as they add or delete things. * * Other notes on VRF concepts. * - An endpoint can be in multiple VRF's * - An association lives within a VRF and only one VRF. * - Any incoming packet we can deduce the VRF for by * looking at the mbuf/pak inbound (for BSD its VRF=0 :D) * - Any downward send call or connect call must supply the * VRF via ancillary data or via some sort of set default * VRF socket option call (again for BSD no brainer since * the VRF is always 0). * - An endpoint may add multiple VRF's to it. * - Listening sockets can accept associations in any * of the VRF's they are in but the assoc will end up * in only one VRF (gotten from the packet or connect/send). * */ struct sctp_vrf * sctp_allocate_vrf(int vrf_id) { struct sctp_vrf *vrf = NULL; struct sctp_vrflist *bucket; /* First allocate the VRF structure */ vrf = sctp_find_vrf(vrf_id); if (vrf) { /* Already allocated */ return (vrf); } SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf), SCTP_M_VRF); if (vrf == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif return (NULL); } /* setup the VRF */ memset(vrf, 0, sizeof(struct sctp_vrf)); vrf->vrf_id = vrf_id; LIST_INIT(&vrf->ifnlist); vrf->total_ifa_count = 0; vrf->refcount = 0; /* now also setup table ids */ SCTP_INIT_VRF_TABLEID(vrf); /* Init the HASH of addresses */ vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE, &vrf->vrf_addr_hashmark); if (vrf->vrf_addr_hash == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif SCTP_FREE(vrf, SCTP_M_VRF); return (NULL); } /* Add it to the hash table */ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_INSERT_HEAD(bucket, vrf, next_vrf); atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); return (vrf); } struct sctp_ifn * sctp_find_ifn(void *ifn, uint32_t ifn_index) { struct sctp_ifn *sctp_ifnp; struct sctp_ifnlist *hash_ifn_head; SCTP_IPI_ADDR_LOCK_ASSERT(); KASSERT(ifn != NULL, ("sctp_find_ifn(NULL, %u) called", ifn_index)); hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { if (sctp_ifnp->ifn_index == ifn_index && sctp_ifnp->ifn_p == ifn) { break; } } return (sctp_ifnp); } struct sctp_vrf * sctp_find_vrf(uint32_t vrf_id) { struct sctp_vrflist *bucket; struct sctp_vrf *liste; bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH(liste, bucket, next_vrf) { if (vrf_id == liste->vrf_id) { return (liste); } } return (NULL); } void sctp_free_vrf(struct sctp_vrf *vrf) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) { if (vrf->vrf_addr_hash) { SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); vrf->vrf_addr_hash = NULL; } /* We zero'd the count */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); } } static void sctp_free_ifn(struct sctp_ifn *sctp_ifnp) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { /* We zero'd the count */ if (sctp_ifnp->vrf) { sctp_free_vrf(sctp_ifnp->vrf); } SCTP_FREE(sctp_ifnp, SCTP_M_IFN); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); } } void sctp_free_ifa(struct sctp_ifa *sctp_ifap) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { /* We zero'd the count */ if (sctp_ifap->ifn_p) { sctp_free_ifn(sctp_ifap->ifn_p); } SCTP_FREE(sctp_ifap, SCTP_M_IFA); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); } } static void sctp_delete_ifn(struct sctp_ifn *sctp_ifnp) { SCTP_IPI_ADDR_WLOCK_ASSERT(); if (sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index) == NULL) { /* Not in the list.. sorry */ return; } LIST_REMOVE(sctp_ifnp, next_bucket); LIST_REMOVE(sctp_ifnp, next_ifn); /* Take away the reference, and possibly free it */ sctp_free_ifn(sctp_ifnp); } /*- * Add an ifa to an ifn. * Register the interface as necessary. */ static void sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) { int ifa_af; SCTP_IPI_ADDR_WLOCK_ASSERT(); LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); /* update address counts */ sctp_ifnp->ifa_count++; ifa_af = sctp_ifap->address.sa.sa_family; switch (ifa_af) { #ifdef INET case AF_INET: sctp_ifnp->num_v4++; break; #endif #ifdef INET6 case AF_INET6: sctp_ifnp->num_v6++; break; #endif default: break; } if (sctp_ifnp->ifa_count == 1) { /* register the new interface */ sctp_ifnp->registered_af = ifa_af; } } /*- * Remove an ifa from its ifn. * If no more addresses exist, remove the ifn too. Otherwise, re-register * the interface based on the remaining address families left. */ static void sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) { SCTP_IPI_ADDR_WLOCK_ASSERT(); LIST_REMOVE(sctp_ifap, next_ifa); if (sctp_ifap->ifn_p) { /* update address counts */ sctp_ifap->ifn_p->ifa_count--; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: sctp_ifap->ifn_p->num_v4--; break; #endif #ifdef INET6 case AF_INET6: sctp_ifap->ifn_p->num_v6--; break; #endif default: break; } if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { /* remove the ifn, possibly freeing it */ sctp_delete_ifn(sctp_ifap->ifn_p); } else { /* re-register address family type, if needed */ if ((sctp_ifap->ifn_p->num_v6 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET6)) { sctp_ifap->ifn_p->registered_af = AF_INET; } else if ((sctp_ifap->ifn_p->num_v4 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET)) { sctp_ifap->ifn_p->registered_af = AF_INET6; } /* free the ifn refcount */ sctp_free_ifn(sctp_ifap->ifn_p); } sctp_ifap->ifn_p = NULL; } } struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, uint32_t ifn_type, const char *if_name, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add) { struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifnp, *new_sctp_ifnp; struct sctp_ifa *sctp_ifap, *new_sctp_ifap; struct sctp_ifalist *hash_addr_head; struct sctp_ifnlist *hash_ifn_head; uint32_t hash_of_addr; #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif SCTP_MALLOC(new_sctp_ifnp, struct sctp_ifn *, sizeof(struct sctp_ifn), SCTP_M_IFN); if (new_sctp_ifnp == NULL) { #ifdef INVARIANTS panic("No memory for IFN"); #endif return (NULL); } SCTP_MALLOC(new_sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA); if (new_sctp_ifap == NULL) { #ifdef INVARIANTS panic("No memory for IFA"); #endif SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); return (NULL); } SCTP_IPI_ADDR_WLOCK(); sctp_ifnp = sctp_find_ifn(ifn, ifn_index); if (sctp_ifnp) { vrf = sctp_ifnp->vrf; } else { vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { vrf = sctp_allocate_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_WUNLOCK(); SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (NULL); } } } if (sctp_ifnp == NULL) { /* * build one and add it, can't hold lock until after malloc * done though. */ sctp_ifnp = new_sctp_ifnp; new_sctp_ifnp = NULL; memset(sctp_ifnp, 0, sizeof(struct sctp_ifn)); sctp_ifnp->ifn_index = ifn_index; sctp_ifnp->ifn_p = ifn; sctp_ifnp->ifn_type = ifn_type; sctp_ifnp->refcount = 0; sctp_ifnp->vrf = vrf; atomic_add_int(&vrf->refcount, 1); sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index); if (if_name != NULL) { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", if_name); } else { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", "unknown"); } hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_INIT(&sctp_ifnp->ifalist); LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap != NULL) { /* The address being added is already or still known. */ if (sctp_ifap->ifn_p != NULL) { if (sctp_ifap->ifn_p->ifn_index == ifn_index && sctp_ifap->ifn_p->ifn_p == ifn) { SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", sctp_ifap->ifn_p->ifn_name, ifn_index, (void *)sctp_ifap); if (new_sctp_ifnp == NULL) { /* Remove the created one not used. */ sctp_delete_ifn(sctp_ifnp); } if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { /* Switch back to active. */ SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); sctp_ifap->localifa_flags = SCTP_ADDR_VALID; sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); } } else { /* * The last IFN gets the address, remove the * old one. */ SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name, sctp_ifap->ifn_p->ifn_index, if_name, ifn_index); /* remove the address from the old ifn */ sctp_remove_ifa_from_ifn(sctp_ifap); /* move the address over to the new ifn */ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } } else { /* Repair ifn_p, which was NULL... */ sctp_ifap->localifa_flags = SCTP_ADDR_VALID; SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", (void *)sctp_ifnp, (void *)sctp_ifap); sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (sctp_ifap); } KASSERT(sctp_ifnp != NULL, ("sctp_add_addr_to_vrf: sctp_ifnp == NULL")); KASSERT(sctp_ifap == NULL, ("sctp_add_addr_to_vrf: sctp_ifap (%p) != NULL", sctp_ifap)); sctp_ifap = new_sctp_ifap; memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifnp->refcount, 1); sctp_ifap->vrf_id = vrf_id; sctp_ifap->ifa = ifa; memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &sctp_ifap->address.sin; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_loop = 1; } if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v4++; if (new_sctp_ifnp == NULL) sctp_ifnp->registered_af = AF_INET; break; } #endif #ifdef INET6 case AF_INET6: { /* ok to use deprecated addresses? */ struct sockaddr_in6 *sin6; sin6 = &sctp_ifap->address.sin6; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { sctp_ifap->src_is_loop = 1; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v6++; if (new_sctp_ifnp == NULL) sctp_ifnp->registered_af = AF_INET6; break; } #endif default: break; } hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); if ((sctp_ifap->src_is_priv == 0) && (sctp_ifap->src_is_loop == 0)) { sctp_ifap->src_is_glob = 1; } hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket); sctp_ifap->refcount = 1; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifnp->ifa_count++; vrf->total_ifa_count++; atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } if (dynamic_add) { /* * Bump up the refcount so that when the timer completes it * will drop back down. */ struct sctp_laddr *wi; atomic_add_int(&sctp_ifap->refcount, 1); wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Opps, must decrement the count */ sctp_del_addr_from_vrf(vrf_id, addr, ifn, ifn_index); return (NULL); } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_ADD_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } else { /* it's ready for use */ sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } return (sctp_ifap); } void sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, void *ifn, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_WLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); SCTP_IPI_ADDR_WUNLOCK(); return; } #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap != NULL) { /* Validate the delete */ if (sctp_ifap->ifn_p) { if (ifn_index != sctp_ifap->ifn_p->ifn_index || ifn != sctp_ifap->ifn_p->ifn_p) { SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d (%p) ifname:%s - ignoring delete\n", sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_p, sctp_ifap->ifn_p->ifn_name); SCTP_IPI_ADDR_WUNLOCK(); return; } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until * they close. This gives us TCP like behavior if someone * removes an address (or for that matter adds it right * back). */ /* sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; */ vrf->total_ifa_count--; LIST_REMOVE(sctp_ifap, next_bucket); sctp_remove_ifa_from_ifn(sctp_ifap); } #ifdef SCTP_DEBUG else { SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", ifn_index); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); } #endif SCTP_IPI_ADDR_WUNLOCK(); if (sctp_ifap != NULL) { struct sctp_laddr *wi; wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Oops, must decrement the count */ sctp_free_ifa(sctp_ifap); return; } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_DEL_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); /* * Should this really be a tailq? As it is we will process * the newest first :-0 */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } return; } static int sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { /* no vrf, no addresses */ SCTP_IPI_ADDR_RUNLOCK(); return (0); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) { /* * We allow pending addresses, where * we have sent an asconf-add to be * considered valid. */ continue; } if (sctp_ifa->address.sa.sa_family != to->sa_family) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin, *rsin; sin = &sctp_ifa->address.sin; rsin = (struct sockaddr_in *)to; if ((ipv4_local_scope == 0) && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6, *rsin6; sin6 = &sctp_ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) { /* * We allow pending addresses, where we have * sent an asconf-add to be considered * valid. */ continue; } if (laddr->ifa->address.sa.sa_family != to->sa_family) { continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = &laddr->ifa->address.sin; rsin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = &laddr->ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif default: /* TSNH */ break; } } } SCTP_IPI_ADDR_RUNLOCK(); return (0); } static struct sctp_tcb * sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id) { /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */ /* * If we support the TCP model, then we must now dig through to see * if we can find our endpoint in the list of tcp ep's. */ uint16_t lport, rport; struct sctppcbhead *ephead; struct sctp_inpcb *inp; struct sctp_laddr *laddr; struct sctp_tcb *stcb; struct sctp_nets *net; if ((to == NULL) || (from == NULL)) { return (NULL); } switch (to->sa_family) { #ifdef INET case AF_INET: if (from->sa_family == AF_INET) { lport = ((struct sockaddr_in *)to)->sin_port; rport = ((struct sockaddr_in *)from)->sin_port; } else { return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (from->sa_family == AF_INET6) { lport = ((struct sockaddr_in6 *)to)->sin6_port; rport = ((struct sockaddr_in6 *)from)->sin6_port; } else { return (NULL); } break; #endif default: return (NULL); } ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; /* * Ok now for each of the guys in this bucket we must look and see: * - Does the remote port match. - Does there single association's * addresses match this address (to). If so we update p_ep to point * to this ep and return the tcb from it. */ LIST_FOREACH(inp, ephead, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if (lport != inp->sctp_lport) { SCTP_INP_RUNLOCK(inp); continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)to; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif default: SCTP_INP_RUNLOCK(inp); continue; } if (inp->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(inp); continue; } /* check to see if the ep has one of the addresses */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* We are NOT bound all, so look further */ int match = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == to->sa_family) { /* see if it matches */ #ifdef INET if (from->sa_family == AF_INET) { struct sockaddr_in *intf_addr, *sin; intf_addr = &laddr->ifa->address.sin; sin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == intf_addr->sin_addr.s_addr) { match = 1; break; } } #endif #ifdef INET6 if (from->sa_family == AF_INET6) { struct sockaddr_in6 *intf_addr6; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *) to; intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { match = 1; break; } } #endif } } if (match == 0) { /* This endpoint does not have this address */ SCTP_INP_RUNLOCK(inp); continue; } } /* * Ok if we hit here the ep has the address, does it hold * the tcb? */ /* XXX: Why don't we TAILQ_FOREACH through sctp_asoc_list? */ stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); continue; } SCTP_TCB_LOCK(stcb); if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } /* Does this TCB have a matching address? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family != from->sa_family) { /* not the same family, can't be a match */ continue; } switch (from->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *)&net->ro._l_addr; rsin = (struct sockaddr_in *)from; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)from; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); } return (NULL); } /* * rules for use * * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an * stcb, both will be locked (locked_tcb and stcb) but decrement will be done * (if locked == NULL). 3) Decrement happens on return ONLY if locked == * NULL. */ struct sctp_tcb * sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb) { struct sctpasochead *head; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct sctp_nets *net; uint16_t rport; inp = *inp_p; switch (remote->sa_family) { #ifdef INET case AF_INET: rport = (((struct sockaddr_in *)remote)->sin_port); break; #endif #ifdef INET6 case AF_INET6: rport = (((struct sockaddr_in6 *)remote)->sin6_port); break; #endif default: return (NULL); } if (locked_tcb) { /* * UN-lock so we can do proper locking here this occurs when * called from load_addresses_from_init. */ atomic_add_int(&locked_tcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(locked_tcb); } SCTP_INP_INFO_RLOCK(); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /*- * Now either this guy is our listener or it's the * connector. If it is the one that issued the connect, then * it's only chance is to be the first TCB in the list. If * it is the acceptor, then do the special_lookup to hash * and find the real inp. */ if ((inp->sctp_socket) && SCTP_IS_LISTENING(inp)) { /* to is peer addr, from is my addr */ stcb = sctp_tcb_special_locate(inp_p, remote, local, netp, inp->def_vrf_id); if ((stcb != NULL) && (locked_tcb == NULL)) { /* we have a locked tcb, lower refcount */ SCTP_INP_DECR_REF(inp); } if ((locked_tcb != NULL) && (locked_tcb != stcb)) { SCTP_INP_RLOCK(locked_tcb->sctp_ep); SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); SCTP_INP_RUNLOCK(locked_tcb->sctp_ep); } SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto null_return; } SCTP_TCB_LOCK(stcb); if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); goto null_return; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); goto null_return; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); goto null_return; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, inp->sctp_hashmark)]; LIST_FOREACH(stcb, head, sctp_tcbhash) { if (stcb->rport != rport) { /* remote port does not match */ continue; } SCTP_TCB_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); continue; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *) &net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } null_return: /* clean up for returning null */ if (locked_tcb) { SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); /* not found */ return (NULL); } /* * Find an association for a specific endpoint using the association id given * out in the COMM_UP notification */ struct sctp_tcb * sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { /* * Use my the assoc_id to find a endpoint */ struct sctpasochead *head; struct sctp_tcb *stcb; uint32_t id; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_PRINTF("TSNH ep_associd0\n"); return (NULL); } id = (uint32_t)asoc_id; head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; if (head == NULL) { /* invalid id TSNH */ SCTP_PRINTF("TSNH ep_associd1\n"); return (NULL); } LIST_FOREACH(stcb, head, sctp_tcbasocidhash) { if (stcb->asoc.assoc_id == id) { if (inp != stcb->sctp_ep) { /* * some other guy has the same id active (id * collision ??). */ SCTP_PRINTF("TSNH ep_associd2\n"); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { continue; } if (want_lock) { SCTP_TCB_LOCK(stcb); } return (stcb); } } return (NULL); } struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { struct sctp_tcb *stcb; SCTP_INP_RLOCK(inp); stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock); SCTP_INP_RUNLOCK(inp); return (stcb); } /* * Endpoint probe expects that the INP_INFO is locked. */ static struct sctp_inpcb * sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, uint16_t lport, uint32_t vrf_id) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sockaddr_in6 *intf_addr6; #endif int fnd; #ifdef INET sin = NULL; #endif #ifdef INET6 sin6 = NULL; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; break; #endif default: /* unsupported family */ return (NULL); } if (head == NULL) return (NULL); LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) && (inp->sctp_lport == lport)) { /* got it */ switch (nam->sa_family) { #ifdef INET case AF_INET: if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* * IPv4 on a IPv6 socket with ONLY * IPv6 set */ SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif #ifdef INET6 case AF_INET6: /* * A V6 address and the endpoint is NOT * bound V6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif default: break; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; SCTP_INP_RUNLOCK(inp); if (!fnd) continue; return (inp); } SCTP_INP_RUNLOCK(inp); } switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == INADDR_ANY) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif default: break; } /* * ok, not bound to all so see if we can find a EP bound to this * address. */ LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) { SCTP_INP_RUNLOCK(inp); continue; } /* * Ok this could be a likely candidate, look at all of its * addresses */ if (inp->sctp_lport != lport) { SCTP_INP_RUNLOCK(inp); continue; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) { SCTP_INP_RUNLOCK(inp); continue; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", (void *)laddr->ifa); if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == nam->sa_family) { /* possible, see if it matches */ switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == laddr->ifa->address.sin.sin_addr.s_addr) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif #ifdef INET6 case AF_INET6: intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif } } } SCTP_INP_RUNLOCK(inp); } return (NULL); } static struct sctp_inpcb * sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id) { struct sctppcbhead *head; struct sctp_inpcb *t_inp; int fnd; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; LIST_FOREACH(t_inp, head, sctp_hash) { if (t_inp->sctp_lport != lport) { continue; } /* is it in the VRF in question */ fnd = 0; if (t_inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) continue; /* This one is in use. */ /* check the v6/v4 binding issue */ if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(t_inp)) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* collision in V6 space */ return (t_inp); } else { /* inp is BOUND_V4 no conflict */ continue; } } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* t_inp is bound v4 and v6, conflict always */ return (t_inp); } else { /* t_inp is bound only V4 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* no conflict */ continue; } /* else fall through to conflict */ } return (t_inp); } return (NULL); } int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) { /* For 1-2-1 with port reuse */ struct sctppcbhead *head; struct sctp_inpcb *tinp, *ninp; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* only works with port reuse on */ return (-1); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) { return (0); } SCTP_INP_WUNLOCK(inp); head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; /* Kick out all non-listeners to the TCP hash */ LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) { if (tinp->sctp_lport != inp->sctp_lport) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { continue; } if (SCTP_IS_LISTENING(tinp)) { continue; } SCTP_INP_WLOCK(tinp); LIST_REMOVE(tinp, sctp_hash); head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))]; tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; LIST_INSERT_HEAD(head, tinp, sctp_hash); SCTP_INP_WUNLOCK(tinp); } SCTP_INP_WLOCK(inp); /* Pull from where he was */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; LIST_INSERT_HEAD(head, inp, sctp_hash); return (0); } struct sctp_inpcb * sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock, uint32_t vrf_id) { /* * First we check the hash table to see if someone has this port * bound with just the port. */ struct sctp_inpcb *inp; struct sctppcbhead *head; int lport; unsigned int i; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; lport = sin->sin_port; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; lport = sin6->sin6_port; break; #endif default: return (NULL); } /* * I could cheat here and just cast to one of the types but we will * do it right. It also provides the check against an Unsupported * type too. */ /* Find the head of the ALLADDR chain */ if (have_lock == 0) { SCTP_INP_INFO_RLOCK(); } head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); /* * If the TCP model exists it could be that the main listening * endpoint is gone but there still exists a connected socket for * this guy. If so we can return the first one that we find. This * may NOT be the correct one so the caller should be wary on the * returned INP. Currently the only caller that sets find_tcp_pool * is in bindx where we are verifying that a user CAN bind the * address. He either has bound it already, or someone else has, or * its open to bind, so this is good enough. */ if (inp == NULL && find_tcp_pool) { for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) { head = &SCTP_BASE_INFO(sctp_tcpephash)[i]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); if (inp) { break; } } } if (inp) { SCTP_INP_INCR_REF(inp); } if (have_lock == 0) { SCTP_INP_INFO_RUNLOCK(); } return (inp); } /* * Find an association for an endpoint with the pointer to whom you want to * send to and the endpoint pointer. The address can be IPv4 or IPv6. We may * need to change the *to to some other struct like a mbuf... */ struct sctp_tcb * sctp_findassociation_addr_sa(struct sockaddr *from, struct sockaddr *to, struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool, uint32_t vrf_id) { struct sctp_inpcb *inp = NULL; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); if (find_tcp_pool) { if (inp_p != NULL) { stcb = sctp_tcb_special_locate(inp_p, from, to, netp, vrf_id); } else { stcb = sctp_tcb_special_locate(&inp, from, to, netp, vrf_id); } if (stcb != NULL) { SCTP_INP_INFO_RUNLOCK(); return (stcb); } } inp = sctp_pcb_findep(to, 0, 1, vrf_id); if (inp_p != NULL) { *inp_p = inp; } SCTP_INP_INFO_RUNLOCK(); if (inp == NULL) { return (NULL); } /* * ok, we have an endpoint, now lets find the assoc for it (if any) * we now place the source address or from in the to of the find * endpoint call. Since in reality this chain is used from the * inbound packet side. */ if (inp_p != NULL) { stcb = sctp_findassociation_ep_addr(inp_p, from, netp, to, NULL); } else { stcb = sctp_findassociation_ep_addr(&inp, from, netp, to, NULL); } return (stcb); } /* * This routine will grub through the mbuf that is a INIT or INIT-ACK and * find all addresses that the sender has specified in any address list. Each * address will be used to lookup the TCB and see if one exits. */ static struct sctp_tcb * sctp_findassociation_special_addr(struct mbuf *m, int offset, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, struct sockaddr *dst) { struct sctp_paramhdr *phdr, param_buf; #if defined(INET) || defined(INET6) struct sctp_tcb *stcb; uint16_t ptype; #endif uint16_t plen; #ifdef INET struct sockaddr_in sin4; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_len = sizeof(sin4); sin4.sin_family = AF_INET; sin4.sin_port = sh->src_port; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_port = sh->src_port; #endif offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr != NULL) { /* now we must see if we want the parameter */ #if defined(INET) || defined(INET6) ptype = ntohs(phdr->param_type); #endif plen = ntohs(phdr->param_length); if (plen == 0) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS && plen == sizeof(struct sctp_ipv4addr_param)) { /* Get the rest of the address */ struct sctp_ipv4addr_param ip4_param, *p4; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip4_param, sizeof(ip4_param)); if (phdr == NULL) { return (NULL); } p4 = (struct sctp_ipv4addr_param *)phdr; memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin4, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS && plen == sizeof(struct sctp_ipv6addr_param)) { /* Get the rest of the address */ struct sctp_ipv6addr_param ip6_param, *p6; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip6_param, sizeof(ip6_param)); if (phdr == NULL) { return (NULL); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin6, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } return (NULL); } static struct sctp_tcb * sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport, uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag) { /* * Use my vtag to hash. If we find it we then verify the source addr * is in the assoc. If all goes well we save a bit on rec of a * packet. */ struct sctpasochead *head; struct sctp_nets *net; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { SCTP_INP_RLOCK(stcb->sctp_ep); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } if (stcb->sctp_ep->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(stcb->sctp_ep); if (stcb->asoc.my_vtag == vtag) { /* candidate */ if (stcb->rport != rport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->sctp_ep->sctp_lport != lport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } /* RRS:Need toaddr check here */ if (sctp_does_stcb_own_this_addr(stcb, to) == 0) { /* Endpoint does not own this address */ SCTP_TCB_UNLOCK(stcb); continue; } if (remote_tag) { /* * If we have both vtags that's all we match * on */ if (stcb->asoc.peer_vtag == remote_tag) { /* * If both tags match we consider it * conclusive and check NO * source/destination addresses */ goto conclusive; } } if (skip_src_check) { conclusive: if (from) { *netp = sctp_findnet(stcb, from); } else { *netp = NULL; /* unknown */ } if (inp_p) *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } net = sctp_findnet(stcb, from); if (net) { /* yep its him. */ *netp = net; SCTP_STAT_INCR(sctps_vtagexpress); *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { /* * not him, this should only happen in rare * cases so I peg it. */ SCTP_STAT_INCR(sctps_vtagbogus); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_INFO_RUNLOCK(); return (NULL); } /* * Find an association with the pointer to the inbound IP packet. This can be * a IPv4 or IPv6 packet. */ struct sctp_tcb * sctp_findassociation_addr(struct mbuf *m, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; struct sctp_inpcb *inp; if (sh->v_tag) { /* we only go down this path if vtag is non-zero */ stcb = sctp_findassoc_by_vtag(src, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0); if (stcb) { return (stcb); } } if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, 1, vrf_id); inp = *inp_p; } else { stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp, 1, vrf_id); } SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp); if (stcb == NULL && inp) { /* Found a EP but not this address */ if ((ch->chunk_type == SCTP_INITIATION) || (ch->chunk_type == SCTP_INITIATION_ACK)) { /*- * special hook, we do NOT return linp or an * association that is linked to an existing * association that is under the TCP pool (i.e. no * listener exists). The endpoint finding routine * will always find a listener before examining the * TCP pool. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) { if (inp_p) { *inp_p = NULL; } return (NULL); } stcb = sctp_findassociation_special_addr(m, offset, sh, &inp, netp, dst); if (inp_p != NULL) { *inp_p = inp; } } } SCTPDBG(SCTP_DEBUG_PCB1, "stcb is %p\n", (void *)stcb); return (stcb); } /* * lookup an association by an ASCONF lookup address. * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup */ struct sctp_tcb * sctp_findassociation_ep_asconf(struct mbuf *m, int offset, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; union sctp_sockstore remote_store; struct sctp_paramhdr param_buf, *phdr; int ptype; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif memset(&remote_store, 0, sizeof(remote_store)); phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), ¶m_buf, sizeof(struct sctp_paramhdr)); if (phdr == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", __func__); return NULL; } ptype = (int)((uint32_t)ntohs(phdr->param_type)); /* get the correlation address */ switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { /* ipv6 address param */ struct sctp_ipv6addr_param *p6, p6_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) { return NULL; } p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p6_buf.ph, sizeof(p6_buf)); if (p6 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", __func__); return (NULL); } sin6 = &remote_store.sin6; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = sh->src_port; memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { /* ipv4 address param */ struct sctp_ipv4addr_param *p4, p4_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) { return NULL; } p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p4_buf.ph, sizeof(p4_buf)); if (p4 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", __func__); return (NULL); } sin = &remote_store.sin; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = sh->src_port; memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr)); if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; break; } #endif default: /* invalid address param type */ return NULL; } if (zero_address) { stcb = sctp_findassoc_by_vtag(NULL, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 1, vrf_id, 0); if (stcb != NULL) { SCTP_INP_DECR_REF(*inp_p); } } else { stcb = sctp_findassociation_ep_addr(inp_p, &remote_store.sa, netp, dst, NULL); } return (stcb); } /* * allocate a sctp_inpcb and setup a temporary binding to a port/all * addresses. This way if we don't get a bind we by default pick a ephemeral * port with all addresses bound. */ int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) { /* * we get called when a new endpoint starts up. We need to allocate * the sctp_inpcb structure from the zone and init it. Mark it as * unbound and find a port that we can use as an ephemeral with * INADDR_ANY. If the user binds later no problem we can then add in * the specific addresses. And setup the default parameters for the * EP. */ int i, error; struct sctp_inpcb *inp; struct sctp_pcb *m; struct timeval time; sctp_sharedkey_t *null_key; error = 0; SCTP_INP_INFO_WLOCK(); inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb); if (inp == NULL) { SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n"); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); return (ENOBUFS); } /* zap it */ memset(inp, 0, sizeof(*inp)); /* bump generations */ /* setup socket pointers */ inp->sctp_socket = so; inp->ip_inp.inp.inp_socket = so; inp->ip_inp.inp.inp_cred = crhold(so->so_cred); #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (MODULE_GLOBAL(ip6_auto_flowlabel)) { inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; } if (MODULE_GLOBAL(ip6_v6only)) { inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY; } } #endif inp->sctp_associd_counter = 1; inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; inp->sctp_frag_point = 0; inp->max_cwnd = 0; inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); inp->ecn_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_ecn_enable); inp->prsctp_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pr_enable); inp->auth_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_auth_enable); inp->asconf_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_asconf_enable); inp->reconfig_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_reconfig_enable); inp->nrsack_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_nrsack_enable); inp->pktdrop_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pktdrop_enable); inp->idata_supported = 0; inp->rcv_edmid = SCTP_EDMID_NONE; inp->fibnum = so->so_fibnum; /* init the small hash table we use to track asocid <-> tcb */ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); if (inp->sctp_asocidhash == NULL) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_INP_INFO_WUNLOCK(); return (ENOBUFS); } SCTP_INCR_EP_COUNT(); inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); SCTP_INP_INFO_WUNLOCK(); so->so_pcb = (caddr_t)inp; if (SCTP_SO_TYPE(so) == SOCK_SEQPACKET) { /* UDP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure it is NON-BLOCKING IO for UDP */ /* SCTP_SET_SO_NBIO(so); */ } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) { /* TCP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure we have blocking IO by default */ SOCK_LOCK(so); SCTP_CLEAR_SO_NBIO(so); SOCK_UNLOCK(so); } else { /* * unsupported socket type (RAW, etc)- in case we missed it * in protosw */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (EOPNOTSUPP); } if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize), &inp->sctp_hashmark); if (inp->sctp_tcbhash == NULL) { SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (ENOBUFS); } inp->def_vrf_id = vrf_id; SCTP_INP_INFO_WLOCK(); SCTP_INP_LOCK_INIT(inp); rw_init_flags(&inp->ip_inp.inp.inp_lock, "sctpinp", RW_RECURSE | RW_DUPOK); SCTP_INP_READ_LOCK_INIT(inp); SCTP_ASOC_CREATE_LOCK_INIT(inp); /* lock the new ep */ SCTP_INP_WLOCK(inp); /* add it to the info area */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list); SCTP_INP_INFO_WUNLOCK(); TAILQ_INIT(&inp->read_queue); LIST_INIT(&inp->sctp_addr_list); LIST_INIT(&inp->sctp_asoc_list); #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_INIT(&inp->sctp_asoc_free_list); #endif /* Init the timer structure for signature change */ SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer); inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE; /* now init the actual endpoint default data */ m = &inp->sctp_ep; /* setup the base timeout information */ m->sctp_timeoutticks[SCTP_TIMER_SEND] = sctp_secs_to_ticks(SCTP_SEND_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_INIT] = sctp_secs_to_ticks(SCTP_INIT_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_RECV] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default)); m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default)); m->sctp_timeoutticks[SCTP_TIMER_PMTU] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default)); m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default)); m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default)); /* all max/min max are in ms */ m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default); m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default); m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); m->def_net_pf_threshold = SCTP_BASE_SYSCTL(sctp_path_pf_threshold); m->sctp_sws_sender = SCTP_SWS_SENDER_DEF; m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF; m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default); m->fr_max_burst = SCTP_BASE_SYSCTL(sctp_fr_max_burst_default); m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); m->default_mtu = 0; /* Add adaptation cookie */ m->adaptation_layer_indicator = 0; m->adaptation_layer_indicator_provided = 0; /* seed random number generator */ m->random_counter = 1; m->store_at = SCTP_SIGNATURE_SIZE; SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers)); sctp_fill_random_store(m); /* Minimum cookie size */ m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) + sizeof(struct sctp_state_cookie); m->size_of_a_cookie += SCTP_SIGNATURE_SIZE; /* Setup the initial secret */ (void)SCTP_GETTIME_TIMEVAL(&time); m->time_of_secret_change = time.tv_sec; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { m->secret_key[0][i] = sctp_select_initial_TSN(m); } sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); /* How long is a cookie good for ? */ m->def_cookie_life = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default)); /* * Initialize authentication parameters */ m->local_hmacs = sctp_default_supported_hmaclist(); m->local_auth_chunks = sctp_alloc_chunklist(); if (inp->asconf_supported) { sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks); } m->default_dscp = 0; #ifdef INET6 m->default_flowlabel = 0; #endif m->port = 0; /* encapsulation disabled by default */ LIST_INIT(&m->shared_keys); /* add default NULL key as key id 0 */ null_key = sctp_alloc_sharedkey(); sctp_insert_sharedkey(&m->shared_keys, null_key); SCTP_INP_WUNLOCK(inp); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 12); #endif return (error); } void sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb) { struct sctp_nets *net; uint16_t lport, rport; struct sctppcbhead *head; struct sctp_laddr *laddr, *oladdr; atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(old_inp); SCTP_INP_WLOCK(new_inp); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #ifdef INET6 if (old_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { new_inp->ip_inp.inp.inp_flags |= old_inp->ip_inp.inp.inp_flags & INP_CONTROLOPTS; if (old_inp->ip_inp.inp.in6p_outputopts) { new_inp->ip_inp.inp.in6p_outputopts = ip6_copypktopts(old_inp->ip_inp.inp.in6p_outputopts, M_NOWAIT); } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { new_inp->ip_inp.inp.inp_ip_tos = old_inp->ip_inp.inp.inp_ip_tos; new_inp->ip_inp.inp.inp_ip_ttl = old_inp->ip_inp.inp.inp_ip_ttl; } #endif new_inp->sctp_ep.time_of_secret_change = old_inp->sctp_ep.time_of_secret_change; memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key, sizeof(old_inp->sctp_ep.secret_key)); new_inp->sctp_ep.current_secret_number = old_inp->sctp_ep.current_secret_number; new_inp->sctp_ep.last_secret_number = old_inp->sctp_ep.last_secret_number; new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie; /* make it so new data pours into the new socket */ stcb->sctp_socket = new_inp->sctp_socket; stcb->sctp_ep = new_inp; /* Copy the port across */ lport = new_inp->sctp_lport = old_inp->sctp_lport; rport = stcb->rport; /* Pull the tcb from the old association */ LIST_REMOVE(stcb, sctp_tcbhash); LIST_REMOVE(stcb, sctp_tcblist); if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } /* Now insert the new_inp into the TCP connected hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; LIST_INSERT_HEAD(head, new_inp, sctp_hash); /* Its safe to access */ new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; /* Now move the tcb into the endpoint list */ LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist); /* * Question, do we even need to worry about the ep-hash since we * only have one connection? Probably not :> so lets get rid of it * and not suck up any kernel memory in that. */ if (stcb->asoc.in_asocid_hash) { struct sctpasochead *lhd; lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id, new_inp->hashasocidmark)]; LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash); } /* Ok. Let's restart timer. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp, stcb, net); } SCTP_INP_INFO_WUNLOCK(); if (new_inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark); new_inp->sctp_tcbhash = NULL; } if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* Subset bound, so copy in the laddr list from the old_inp */ LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) { laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* * Gak, what can we do? This assoc is really * HOSED. We probably should send an abort * here. */ SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n"); continue; } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = oladdr->ifa; atomic_add_int(&laddr->ifa->refcount, 1); LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr, sctp_nxt_addr); new_inp->laddr_count++; if (oladdr == stcb->asoc.last_used_address) { stcb->asoc.last_used_address = laddr; } } } /* Now any running timers need to be adjusted. */ if (stcb->asoc.dack_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.dack_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.asconf_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.asconf_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.strreset_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.strreset_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.shut_guard_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.shut_guard_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.autoclose_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.autoclose_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.delete_prim_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.delete_prim_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } /* now what about the nets? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->pmtu_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->pmtu_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->hb_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->hb_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->rxt_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->rxt_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } } SCTP_INP_WUNLOCK(new_inp); SCTP_INP_WUNLOCK(old_inp); } /* * insert an laddr entry with the given ifa for the desired list */ static int sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) { struct sctp_laddr *laddr; laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = ifa; laddr->action = act; atomic_add_int(&ifa->refcount, 1); /* insert it */ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); return (0); } /* * Remove an laddr entry from the local address list (on an assoc) */ static void sctp_remove_laddr(struct sctp_laddr *laddr) { /* remove from the list */ LIST_REMOVE(laddr, sctp_nxt_addr); sctp_free_ifa(laddr->ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); SCTP_DECR_LADDR_COUNT(); } /* * Bind the socket, with the PCB and global info locks held. Note, if a * socket address is specified, the PCB lock may be dropped and re-acquired. * * sctp_ifap is used to bypass normal local address validation checks. */ int sctp_inpcb_bind_locked(struct sctp_inpcb *inp, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *td) { /* bind a ep to a socket address */ struct sctppcbhead *head; struct sctp_inpcb *inp_tmp; struct inpcb *ip_inp; int port_reuse_active = 0; int bindall; uint16_t lport; int error; uint32_t vrf_id; KASSERT(td != NULL, ("%s: null thread", __func__)); error = 0; lport = 0; bindall = 1; ip_inp = &inp->ip_inp.inp; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); #ifdef SCTP_DEBUG if (addr) { SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port: %d\n", ntohs(((struct sockaddr_in *)addr)->sin_port)); SCTPDBG(SCTP_DEBUG_PCB1, "Addr: "); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { error = EINVAL; /* already did a bind, subsequent binds NOT allowed ! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (addr != NULL) { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* IPV6_V6ONLY socket? */ if (SCTP_IPV6_V6ONLY(inp)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (addr->sa_len != sizeof(*sin)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } sin = (struct sockaddr_in *)addr; lport = sin->sin_port; /* * For LOOPBACK the prison_local_ip4() call * will transmute the ip address to the * proper value. */ if ((error = prison_local_ip4(td->td_ucred, &sin->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (sin->sin_addr.s_addr != INADDR_ANY) { bindall = 0; } break; } #endif #ifdef INET6 case AF_INET6: { /* * Only for pure IPv6 Address. (No IPv4 * Mapped!) */ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (addr->sa_len != sizeof(*sin6)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } lport = sin6->sin6_port; /* * For LOOPBACK the prison_local_ip6() call * will transmute the ipv6 address to the * proper value. */ if ((error = prison_local_ip6(td->td_ucred, &sin6->sin6_addr, (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { bindall = 0; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; break; } #endif default: error = EAFNOSUPPORT; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } /* Setup a vrf_id to be the default for the non-bind-all case. */ vrf_id = inp->def_vrf_id; if (lport) { /* * Did the caller specify a port? if so we must see if an ep * already has this one bound. */ /* got to be root to get at low ports */ if (ntohs(lport) < IPPORT_RESERVED && (error = priv_check(td, PRIV_NETINET_RESERVEDPORT)) != 0) { goto out; } SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); if (bindall) { vrf_id = inp->def_vrf_id; inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } else { inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } continue_anyway: SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); if (bindall) { /* verify that no lport is not used by a singleton */ if ((port_reuse_active == 0) && (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))) { /* Sorry someone already has this one bound */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { port_reuse_active = 1; } else { error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } } } else { uint16_t first, last, candidate; uint16_t count; if (ip_inp->inp_flags & INP_HIGHPORT) { first = MODULE_GLOBAL(ipport_hifirstauto); last = MODULE_GLOBAL(ipport_hilastauto); } else if (ip_inp->inp_flags & INP_LOWPORT) { if ((error = priv_check(td, PRIV_NETINET_RESERVEDPORT)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } first = MODULE_GLOBAL(ipport_lowfirstauto); last = MODULE_GLOBAL(ipport_lowlastauto); } else { first = MODULE_GLOBAL(ipport_firstauto); last = MODULE_GLOBAL(ipport_lastauto); } if (first > last) { uint16_t temp; temp = first; first = last; last = temp; } count = last - first + 1; /* number of candidates */ candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count); for (;;) { if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) { lport = htons(candidate); break; } if (--count == 0) { error = EADDRINUSE; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } if (candidate == last) candidate = first; else candidate = candidate + 1; } } if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* * this really should not happen. The guy did a non-blocking * bind and then did a close at the same time. */ error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } /* ok we look clear to give out this port, so lets setup the binding */ if (bindall) { /* binding to all addresses, so just set in the proper flags */ inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL; /* set the automatic addr changes from kernel flag */ if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } /* * set the automatic mobility_base from kernel flag (by * micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } /* * set the automatic mobility_fasthandoff from kernel flag * (by micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } } else { /* * bind specific, make sure flags is off and add a new * address structure to the sctp_addr_list inside the ep * structure. * * We will need to allocate one and insert it at the head. * The socketopt call can just insert new addresses in there * as well. It will also have to do the embed scope kame * hack too (before adding). */ struct sctp_ifa *ifa; union sctp_sockstore store; memset(&store, 0, sizeof(store)); switch (addr->sa_family) { #ifdef INET case AF_INET: memcpy(&store.sin, addr, sizeof(struct sockaddr_in)); store.sin.sin_port = 0; break; #endif #ifdef INET6 case AF_INET6: memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6)); store.sin6.sin6_port = 0; break; #endif default: break; } /* * first find the interface with the bound address need to * zero out the port to find the address! yuck! can't do * this earlier since need port for sctp_pcb_findep() */ if (sctp_ifap != NULL) { ifa = sctp_ifap; } else { /* * Note for BSD we hit here always other O/S's will * pass things in via the sctp_ifap argument. */ ifa = sctp_find_ifa_by_addr(&store.sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } if (ifa == NULL) { error = EADDRNOTAVAIL; /* Can't find an interface with that address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } #ifdef INET6 if (addr->sa_family == AF_INET6) { /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } } #endif /* we're not bound all */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL; /* allow bindx() to send ASCONF's for binding changes */ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); /* clear automatic addr changes from kernel flag */ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); /* add this address to the endpoint list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0); if (error != 0) goto out; inp->laddr_count++; } /* find the bucket */ if (port_reuse_active) { /* Put it into tcp 1-2-1 hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))]; inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; } else { head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; } /* put it in the bucket */ LIST_INSERT_HEAD(head, inp, sctp_hash); SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n", (void *)head, ntohs(lport), port_reuse_active); /* set in the port */ inp->sctp_lport = lport; /* turn off just the unbound flag */ KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) != 0, ("%s: inp %p is already bound", __func__, inp)); inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; out: return (error); } int sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *td) { struct sctp_inpcb *inp; int error; inp = so->so_pcb; SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); error = sctp_inpcb_bind_locked(inp, addr, sctp_ifap, td); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } static void sctp_iterator_inp_being_freed(struct sctp_inpcb *inp) { struct sctp_iterator *it, *nit; /* * We enter with the only the ITERATOR_LOCK in place and a write * lock on the inp_info stuff. */ it = sctp_it_ctl.cur_it; if (it && (it->vn != curvnet)) { /* Its not looking at our VNET */ return; } if (it && (it->inp == inp)) { /* * This is tricky and we hold the iterator lock, but when it * returns and gets the lock (when we release it) the * iterator will try to operate on inp. We need to stop that * from happening. But of course the iterator has a * reference on the stcb and inp. We can mark it and it will * stop. * * If its a single iterator situation, we set the end * iterator flag. Otherwise we set the iterator to go to the * next inp. * */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } else { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP; } } /* * Now go through and remove any single reference to our inp that * may be still pending on the list */ SCTP_IPI_ITERATOR_WQ_LOCK(); TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } if (it->inp == inp) { /* This one points to me is it inp specific? */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { /* Remove and free this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } else { it->inp = LIST_NEXT(it->inp, sctp_list); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } } /* * When its put in the refcnt is incremented so decr * it */ SCTP_INP_DECR_REF(inp); } } SCTP_IPI_ITERATOR_WQ_UNLOCK(); } /* release sctp_inpcb unbind the port */ void sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) { /* * Here we free a endpoint. We must find it (if it is in the Hash * table) and remove it from there. Then we must also find it in the * overall list and remove it from there. After all removals are * complete then any timer has to be stopped. Then start the actual * freeing. a) Any local lists. b) Any associations. c) The hash of * all associations. d) finally the ep itself. */ struct sctp_tcb *stcb, *nstcb; struct sctp_laddr *laddr, *nladdr; struct inpcb *ip_pcb; struct socket *so; int being_refed = 0; struct sctp_queued_to_read *sq, *nsq; int cnt; sctp_sharedkey_t *shared_key, *nshared_key; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 0); #endif SCTP_ITERATOR_LOCK(); /* mark any iterators on the list or being processed */ sctp_iterator_inp_being_freed(inp); SCTP_ITERATOR_UNLOCK(); SCTP_ASOC_CREATE_LOCK(inp); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); so = inp->sctp_socket; KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) != 0, ("%s: inp %p still has socket", __func__, inp)); KASSERT((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0, ("%s: double free of inp %p", __func__, inp)); if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP; /* socket is gone, so no more wakeups allowed */ inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; } /* First time through we have the socket lock, after that no more. */ sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_1); if (inp->control) { sctp_m_freem(inp->control); inp->control = NULL; } if (inp->pkt) { sctp_m_freem(inp->pkt); inp->pkt = NULL; } ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer * here but I will be nice :> (i.e. * ip_pcb = ep;) */ if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { int cnt_in_sd; cnt_in_sd = 0; LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_list, sctp_tcblist, nstcb) { SCTP_TCB_LOCK(stcb); /* Disconnect the socket please. */ stcb->sctp_socket = NULL; SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_CLOSED_SOCKET); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* Skip guys being freed */ cnt_in_sd++; if (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { /* * Special case - we did not start a * kill timer on the asoc due to it * was not closed. So go ahead and * start it now. */ SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } SCTP_TCB_UNLOCK(stcb); continue; } if (((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) && (stcb->asoc.total_output_queue_size == 0)) { /* * If we have data in queue, we don't want * to just free since the app may have done, * send()/close or connect/send/close. And * it wants the data to get across first. */ /* Just abandon things in the front states */ if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) { cnt_in_sd++; } continue; } if ((stcb->asoc.size_on_reasm_queue > 0) || (stcb->asoc.size_on_all_streams > 0) || ((so != NULL) && (SCTP_SBAVAIL(&so->so_rcv) > 0))) { /* Left with Data unread */ struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) { cnt_in_sd++; } continue; } else if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { if ((*stcb->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, &stcb->asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* * there is nothing queued to send, * so I send shutdown */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED); } } else { /* mark into shutdown pending */ SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if ((*stcb->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, &stcb->asoc)) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) { cnt_in_sd++; } continue; } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } cnt_in_sd++; SCTP_TCB_UNLOCK(stcb); } /* now is there some left in our SHUTDOWN state? */ if (cnt_in_sd) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 2); #endif inp->sctp_socket = NULL; SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } } inp->sctp_socket = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* * ok, this guy has been bound. It's port is somewhere in * the SCTP_BASE_INFO(hash table). Remove it! */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; } /* * If there is a timer running to kill us, forget it, since it may * have a contest on the INP lock.. which would cause us to die ... */ cnt = 0; LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_list, sctp_tcblist, nstcb) { SCTP_TCB_LOCK(stcb); if (immediate != SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { /* Disconnect the socket please */ stcb->sctp_socket = NULL; SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_CLOSED_SOCKET); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { if (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } cnt++; SCTP_TCB_UNLOCK(stcb); continue; } /* Free associations that are NOT killing us */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) && ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } else if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { cnt++; SCTP_TCB_UNLOCK(stcb); continue; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { cnt++; } } if (cnt) { /* Ok we have someone out there that will kill us */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 3); #endif SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } if (SCTP_INP_LOCK_CONTENDED(inp)) being_refed++; if (SCTP_INP_READ_CONTENDED(inp)) being_refed++; if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp)) being_refed++; /* NOTE: 0 refcount also means no timers are referencing us. */ if ((inp->refcount) || (being_refed) || (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 4); #endif sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } inp->sctp_ep.signature_change.type = 0; inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE; /* * Remove it from the list .. last thing we need a lock for. */ LIST_REMOVE(inp, sctp_list); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 5); #endif if ((inp->sctp_asocidhash) != NULL) { SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark); inp->sctp_asocidhash = NULL; } /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(sq, &inp->read_queue, next, nsq) { /* Its only abandoned if it had data left */ if (sq->length) SCTP_STAT_INCR(sctps_left_abandon); TAILQ_REMOVE(&inp->read_queue, sq, next); sctp_free_remote_addr(sq->whoFrom); if (so) SCTP_SB_DECR(&so->so_rcv, sq->length); if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } /* * no need to free the net count, since at this point all * assoc's are gone. */ sctp_free_a_readq(NULL, sq); } /* Now the sctp_pcb things */ /* * free each asoc if it is not already closed/free. we can't use the * macro here since le_next will get freed as part of the * sctp_free_assoc() call. */ if (ip_pcb->inp_options) { (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; } #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { ip6_freepcbopts(ip_pcb->in6p_outputopts); } #endif /* INET6 */ ip_pcb->inp_vflag = 0; /* free up authentication fields */ if (inp->sctp_ep.local_auth_chunks != NULL) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); if (inp->sctp_ep.local_hmacs != NULL) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); LIST_FOREACH_SAFE(shared_key, &inp->sctp_ep.shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* * if we have an address list the following will free the list of * ifaddr's that are set into this ep. Again macro limitations here, * since the LIST_FOREACH could be a bad idea. */ LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { sctp_remove_laddr(laddr); } #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_FOREACH_SAFE(stcb, &inp->sctp_asoc_free_list, sctp_tcblist, nstcb) { LIST_REMOVE(stcb, sctp_tcblist); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); } /* *** END TEMP CODE *** */ #endif /* Now lets see about freeing the EP hash table. */ if (inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark); inp->sctp_tcbhash = NULL; } /* Now we must put the ep memory back into the zone pool */ crfree(inp->ip_inp.inp.inp_cred); INP_LOCK_DESTROY(&inp->ip_inp.inp); SCTP_INP_LOCK_DESTROY(inp); SCTP_INP_READ_LOCK_DESTROY(inp); SCTP_ASOC_CREATE_LOCK_DESTROY(inp); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_DECR_EP_COUNT(); } struct sctp_nets * sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr) { struct sctp_nets *net; /* locate the address */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr)) return (net); } return (NULL); } int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) { struct sctp_ifa *sctp_ifa; sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa) { return (1); } else { return (0); } } /* * add's a remote endpoint address, done with the INIT/INIT-ACK as well as * when a ASCONF arrives that adds it. It will also initialize all the cwnd * stats of stuff. */ int sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, struct sctp_nets **netp, uint16_t port, int set_scope, int from) { /* * The following is redundant to the same lines in the * sctp_aloc_assoc() but is needed since others call the add address * function */ struct sctp_nets *net, *netfirst; int addr_inscope; SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ", from); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr); netfirst = sctp_findnet(stcb, newaddr); if (netfirst) { /* * Lie and return ok, we don't want to make the association * go away for this behavior. It will happen in the TCP * model in a connected socket. It does not reach the hash * table until after the association is built so it can't be * found. Mark as reachable, since the initial creation will * have been cleared and the NOT_IN_ASSOC flag will have * been added... and we don't want to end up removing it * back out. */ if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) { netfirst->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED); } else { netfirst->dest_state = SCTP_ADDR_REACHABLE; } return (0); } addr_inscope = 1; switch (newaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)newaddr; if (sin->sin_addr.s_addr == 0) { /* Invalid address */ return (-1); } /* zero out the zero area */ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* assure len is set */ sin->sin_len = sizeof(struct sockaddr_in); if (set_scope) { if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { stcb->asoc.scope.ipv4_local_scope = 1; } } else { /* Validate the address is in scope */ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && (stcb->asoc.scope.ipv4_local_scope == 0)) { addr_inscope = 0; } } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)newaddr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Invalid address */ return (-1); } /* assure len is set */ sin6->sin6_len = sizeof(struct sockaddr_in6); if (set_scope) { if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { /* * If the new destination is a * LINK_LOCAL we must have common * site scope. Don't set the local * scope since we may not share all * links, only loopback can do this. * Links on the local network would * also be on our private network * for v4 too. */ stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { /* * If the new destination is * SITE_LOCAL then we must have site * scope in common. */ stcb->asoc.scope.site_scope = 1; } } else { /* Validate the address is in scope */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) && (stcb->asoc.scope.loopback_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.local_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.site_scope == 0)) { addr_inscope = 0; } } break; } #endif default: /* not supported family type */ return (-1); } net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets); if (net == NULL) { return (-1); } SCTP_INCR_RADDR_COUNT(); memset(net, 0, sizeof(struct sctp_nets)); (void)SCTP_GETTIME_TIMEVAL(&net->start_time); memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len); switch (newaddr->sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport; break; #endif default: break; } net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id); if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.site_scope = 1; addr_inscope = 1; } net->failure_threshold = stcb->asoc.def_net_failure; net->pf_threshold = stcb->asoc.def_net_pf_threshold; if (addr_inscope == 0) { net->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_OUT_OF_SCOPE); } else { if (from == SCTP_ADDR_IS_CONFIRMED) /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */ net->dest_state = SCTP_ADDR_REACHABLE; else net->dest_state = SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED; } /* * We set this to 0, the timer code knows that this means its an * initial value */ net->rto_needed = 1; net->RTO = 0; net->RTO_measured = 0; stcb->asoc.numnets++; net->ref_count = 1; net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1; net->port = port; net->dscp = stcb->asoc.default_dscp; #ifdef INET6 net->flowlabel = stcb->asoc.default_flowlabel; #endif if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { net->dest_state |= SCTP_ADDR_NOHB; } else { net->dest_state &= ~SCTP_ADDR_NOHB; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { net->dest_state |= SCTP_ADDR_NO_PMTUD; } else { net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } net->heart_beat_delay = stcb->asoc.heart_beat_delay; /* Init the timer structure */ SCTP_OS_TIMER_INIT(&net->rxt_timer.timer); SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer); SCTP_OS_TIMER_INIT(&net->hb_timer.timer); /* Now generate a route for this guy */ #ifdef INET6 /* KAME hack: embed scopeid */ if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); sin6->sin6_scope_id = 0; } #endif SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->asoc.vrf_id, stcb->sctp_ep->fibnum); net->src_addr_selected = 0; if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { /* Get source address */ net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep, stcb, (sctp_route_t *)&net->ro, net, 0, stcb->asoc.vrf_id); if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else if (net->ro._s_addr != NULL) { uint32_t imtu, rmtu, hcmtu; net->src_addr_selected = 1; /* Now get the interface MTU */ if (net->ro._s_addr->ifn_p != NULL) { /* * XXX: Should we here just use * net->ro._s_addr->ifn_p->ifn_mtu */ imtu = SCTP_GATHER_MTU_FROM_IFN_INFO(net->ro._s_addr->ifn_p->ifn_p, net->ro._s_addr->ifn_p->ifn_index); } else { imtu = 0; } rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh); hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum); net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu); } } if (net->mtu == 0) { if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else { switch (newaddr->sa_family) { #ifdef INET case AF_INET: net->mtu = SCTP_DEFAULT_MTU; break; #endif #ifdef INET6 case AF_INET6: net->mtu = 1280; break; #endif default: break; } } } #if defined(INET) || defined(INET6) if (net->port) { net->mtu -= (uint32_t)sizeof(struct udphdr); } #endif if (from == SCTP_ALLOC_ASOC) { stcb->asoc.smallest_mtu = net->mtu; } if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu, true); } #ifdef INET6 if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_recoverscope(sin6); } #endif /* JRS - Use the congestion control given in the CC module */ if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) (*stcb->asoc.cc_functions.sctp_set_initial_cc_param) (stcb, net); /* * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning * of assoc (2005/06/27, iyengar@cis.udel.edu) */ net->find_pseudo_cumack = 1; net->find_rtx_pseudo_cumack = 1; /* Choose an initial flowid. */ net->flowid = stcb->asoc.my_vtag ^ ntohs(stcb->rport) ^ ntohs(stcb->sctp_ep->sctp_lport); net->flowtype = M_HASHTYPE_OPAQUE_HASH; if (netp) { *netp = net; } netfirst = TAILQ_FIRST(&stcb->asoc.nets); if (net->ro.ro_nh == NULL) { /* Since we have no route put it at the back */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); } else if (netfirst == NULL) { /* We are the first one in the pool. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (netfirst->ro.ro_nh == NULL) { /* * First one has NO route. Place this one ahead of the first * one. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) { /* * This one has a different interface than the one at the * top of the list. Place it ahead. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else { /* * Ok we have the same interface as the first one. Move * forward until we find either a) one with a NULL route... * insert ahead of that b) one with a different ifp.. insert * after that. c) end of the list.. insert at the tail. */ struct sctp_nets *netlook; do { netlook = TAILQ_NEXT(netfirst, sctp_next); if (netlook == NULL) { /* End of the list */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); break; } else if (netlook->ro.ro_nh == NULL) { /* next one has NO route */ TAILQ_INSERT_BEFORE(netfirst, net, sctp_next); break; } else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) { TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook, net, sctp_next); break; } /* Shift forward */ netfirst = netlook; } while (netlook != NULL); } /* got to have a primary set */ if (stcb->asoc.primary_destination == 0) { stcb->asoc.primary_destination = net; } else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) && (net->ro.ro_nh) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) { /* No route to current primary adopt new primary */ stcb->asoc.primary_destination = net; } /* Validate primary is first */ net = TAILQ_FIRST(&stcb->asoc.nets); if ((net != stcb->asoc.primary_destination) && (stcb->asoc.primary_destination)) { /* * first one on the list is NOT the primary sctp_cmpaddr() * is much more efficient if the primary is the first on the * list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } static uint32_t sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb) { uint32_t id; struct sctpasochead *head; struct sctp_tcb *lstcb; try_again: if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* TSNH */ return (0); } /* * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC, * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC. */ if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) { inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1; } id = inp->sctp_associd_counter; inp->sctp_associd_counter++; lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t)id, 0); if (lstcb) { goto try_again; } head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash); stcb->asoc.in_asocid_hash = 1; return (id); } /* * allocate an association and add it to the endpoint. The caller must be * careful to add all additional addresses once they are know right away or * else the assoc will be may experience a blackout scenario. */ static struct sctp_tcb * sctp_aloc_assoc_locked(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { /* note the p argument is only valid in unbound sockets */ struct sctp_tcb *stcb; struct sctp_association *asoc; struct sctpasochead *head; uint16_t rport; int err; SCTP_INP_INFO_WLOCK_ASSERT(); SCTP_INP_WLOCK_ASSERT(inp); /* * Assumption made here: Caller has done a * sctp_findassociation_ep_addr(ep, addr's); to make sure the * address does not exist already. */ if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) { /* Hit max assoc, sorry no more */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } if (firstaddr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) || (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { /* * If its in the TCP pool, its NOT allowed to create an * association. The parent listener needs to call * sctp_aloc_assoc.. or the one-2-many socket. If a peeled * off, or connected one does this.. its an error. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } } SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:"); #ifdef SCTP_DEBUG if (firstaddr) { SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr); switch (firstaddr->sa_family) { #ifdef INET case AF_INET: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in *)firstaddr)->sin_port)); break; #endif #ifdef INET6 case AF_INET6: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in6 *)firstaddr)->sin6_port)); break; #endif default: break; } } else { SCTPDBG(SCTP_DEBUG_PCB3, "None\n"); } #endif /* SCTP_DEBUG */ switch (firstaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)firstaddr; if ((ntohs(sin->sin_port) == 0) || in_broadcast(sin->sin_addr) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (SCTP_IPV6_V6ONLY(inp) != 0))) { /* Invalid address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin->sin_port; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)firstaddr; if ((ntohs(sin6->sin6_port) == 0) || IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0)) { /* Invalid address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin6->sin6_port; break; } #endif default: /* not supported family type */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* * If you have not performed a bind, then we need to do the * ephemeral bind for you. */ if ((err = sctp_inpcb_bind_locked(inp, NULL, NULL, p))) { /* bind error, probably perm */ *error = err; return (NULL); } } stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb); if (stcb == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); *error = ENOMEM; return (NULL); } SCTP_INCR_ASOC_COUNT(); memset(stcb, 0, sizeof(*stcb)); asoc = &stcb->asoc; SCTP_TCB_LOCK_INIT(stcb); stcb->rport = rport; /* setup back pointer's */ stcb->sctp_ep = inp; stcb->sctp_socket = inp->sctp_socket; if ((err = sctp_init_asoc(inp, stcb, override_tag, initial_tsn, vrf_id, o_streams))) { /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); *error = err; return (NULL); } SCTP_TCB_LOCK(stcb); asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); /* now that my_vtag is set, add it to the hash */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* put it in the bucket in the vtag hash of assoc's for the system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); if (sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC)) { /* failure.. memory error? */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } SCTP_DECR_ASOC_COUNT(); SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_asocs); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } /* Init all the timers */ SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer); SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer); SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer); SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer); SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer); SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer); LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist); /* now file the port under the hash as well */ if (inp->sctp_tcbhash != NULL) { head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport, inp->sctp_hashmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); } if (initialize_auth_params == SCTP_INITIALIZE_AUTH_PARAMS) { sctp_initialize_auth_params(inp, stcb); } SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb); return (stcb); } struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { struct sctp_tcb *stcb; SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); stcb = sctp_aloc_assoc_locked(inp, firstaddr, error, override_tag, initial_tsn, vrf_id, o_streams, port, p, initialize_auth_params); SCTP_INP_INFO_WUNLOCK(); SCTP_INP_WUNLOCK(inp); return (stcb); } struct sctp_tcb * sctp_aloc_assoc_connected(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t initial_tsn, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { struct sctp_tcb *stcb; SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && SCTP_IS_LISTENING(inp)) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_WUNLOCK(inp); *error = EINVAL; return (NULL); } stcb = sctp_aloc_assoc_locked(inp, firstaddr, error, override_tag, initial_tsn, vrf_id, o_streams, port, p, initialize_auth_params); SCTP_INP_INFO_WUNLOCK(); if (stcb != NULL && (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) { inp->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; soisconnecting(inp->sctp_socket); } SCTP_INP_WUNLOCK(inp); return (stcb); } void sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_inpcb *inp; struct sctp_association *asoc; inp = stcb->sctp_ep; asoc = &stcb->asoc; asoc->numnets--; TAILQ_REMOVE(&asoc->nets, net, sctp_next); if (net == asoc->primary_destination) { /* Reset primary */ struct sctp_nets *lnet; lnet = TAILQ_FIRST(&asoc->nets); /* * Mobility adaptation Ideally, if deleted destination is * the primary, it becomes a fast retransmission trigger by * the subsequent SET PRIMARY. (by micchie) */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n"); if (asoc->deleted_primary != NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n"); goto out; } asoc->deleted_primary = net; atomic_add_int(&net->ref_count, 1); memset(&net->lastsa, 0, sizeof(net->lastsa)); memset(&net->lastsv, 0, sizeof(net->lastsv)); sctp_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED); sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL); } out: /* Try to find a confirmed primary */ asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0); } if (net == asoc->last_data_chunk_from) { /* Reset primary */ asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets); } if (net == asoc->last_control_chunk_from) { /* Clear net */ asoc->last_control_chunk_from = NULL; } if (net == asoc->last_net_cmt_send_started) { /* Clear net */ asoc->last_net_cmt_send_started = NULL; } if (net == stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_PCB + SCTP_LOC_9); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_PCB + SCTP_LOC_10); net->dest_state |= SCTP_ADDR_BEING_DELETED; sctp_free_remote_addr(net); } /* * remove a remote endpoint address from an association, it will fail if the * address does not exist. */ int sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr) { /* * Here we need to remove a remote address. This is quite simple, we * first find it in the list of address for the association * (tasoc->asoc.nets) and then if it is there, we do a LIST_REMOVE * on that item. Note we do not allow it to be removed if there are * no other addresses. */ struct sctp_association *asoc; struct sctp_nets *net, *nnet; asoc = &stcb->asoc; /* locate the address */ TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) { continue; } if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr, remaddr)) { /* we found the guy */ if (asoc->numnets < 2) { /* Must have at LEAST two remote addresses */ return (-1); } else { sctp_remove_net(stcb, net); return (0); } } } /* not found. */ return (-2); } static bool sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport, time_t now) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; int i; SCTP_INP_INFO_LOCK_ASSERT(); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].tv_sec_at_expire >= now) && (twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { return (true); } } } return (false); } static void sctp_set_vtag_block(struct sctp_timewait *vtag_block, time_t time, uint32_t tag, uint16_t lport, uint16_t rport) { vtag_block->tv_sec_at_expire = time; vtag_block->v_tag = tag; vtag_block->lport = lport; vtag_block->rport = rport; } static void sctp_add_vtag_to_timewait(uint32_t tag, uint16_t lport, uint16_t rport) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct timeval now; time_t time; int i; bool set; SCTP_INP_INFO_WLOCK_ASSERT(); (void)SCTP_GETTIME_TIMEVAL(&now); time = now.tv_sec + SCTP_BASE_SYSCTL(sctp_vtag_time_wait); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; set = false; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* Block(s) present, lets find space, and expire on the fly */ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].v_tag == 0) && !set) { sctp_set_vtag_block(twait_block->vtag_block + i, time, tag, lport, rport); set = true; continue; } if ((twait_block->vtag_block[i].v_tag != 0) && (twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { if (set) { /* Audit expires this guy */ sctp_set_vtag_block(twait_block->vtag_block + i, 0, 0, 0, 0); } else { /* Reuse it for the new tag */ sctp_set_vtag_block(twait_block->vtag_block + i, time, tag, lport, rport); set = true; } } } if (set) { /* * We only do up to the block where we can place our * tag for audits */ break; } } /* Need to add a new block to chain */ if (!set) { SCTP_MALLOC(twait_block, struct sctp_tagblock *, sizeof(struct sctp_tagblock), SCTP_M_TIMW); if (twait_block == NULL) { return; } memset(twait_block, 0, sizeof(struct sctp_tagblock)); LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock); sctp_set_vtag_block(twait_block->vtag_block, time, tag, lport, rport); } } void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh) { struct sctp_tmit_chunk *chk, *nchk; struct sctp_queued_to_read *control, *ncontrol; TAILQ_FOREACH_SAFE(control, rh, next_instrm, ncontrol) { TAILQ_REMOVE(rh, control, next_instrm); control->on_strm_q = 0; if (control->on_read_q == 0) { sctp_free_remote_addr(control->whoFrom); if (control->data) { sctp_m_freem(control->data); control->data = NULL; } } /* Reassembly free? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* * We don't free the address here since all the net's were * freed above. */ if (control->on_read_q == 0) { sctp_free_a_readq(stcb, control); } } } /*- * Free the association after un-hashing the remote port. This * function ALWAYS returns holding NO LOCK on the stcb. It DOES * expect that the input to this function IS a locked TCB. * It will return 0, if it did NOT destroy the association (instead * it unlocks it. It will return NON-zero if it either destroyed the * association OR the association is already destroyed. */ int sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location) { int i; struct sctp_association *asoc; struct sctp_nets *net, *nnet; struct sctp_laddr *laddr, *naddr; struct sctp_tmit_chunk *chk, *nchk; struct sctp_asconf_addr *aparam, *naparam; struct sctp_asconf_ack *aack, *naack; struct sctp_stream_reset_list *strrst, *nstrrst; struct sctp_queued_to_read *sq, *nsq; struct sctp_stream_queue_pending *sp, *nsp; sctp_sharedkey_t *shared_key, *nshared_key; struct socket *so; /* first, lets purge the entry from the hash table. */ SCTP_TCB_LOCK_ASSERT(stcb); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 6); #endif if (stcb->asoc.state == 0) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 7); #endif /* there is no asoc, really TSNH :-0 */ return (1); } if (stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } /* TEMP CODE */ if (stcb->freed_from_where == 0) { /* Only record the first place free happened from */ stcb->freed_from_where = from_location; } /* TEMP CODE */ asoc = &stcb->asoc; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; else so = inp->sctp_socket; /* * We used timer based freeing if a reader or writer is in the way. * So we first check if we are actually being called from a timer, * if so we abort early if a reader or writer is still in the way. */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) && (from_inpcbfree == SCTP_NORMAL_PROC)) { /* * is it the timer driving us? if so are the reader/writers * gone? */ if (stcb->asoc.refcnt) { /* nope, reader or writer in the way */ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); /* no asoc destroyed */ SCTP_TCB_UNLOCK(stcb); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 8); #endif return (0); } } /* Now clean up any other timers */ sctp_stop_association_timers(stcb, false); /* Now the read queue needs to be cleaned up (only once) */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_ABOUT_TO_BE_FREED); SCTP_INP_READ_LOCK(inp); TAILQ_FOREACH(sq, &inp->read_queue, next) { if (sq->stcb == stcb) { sq->do_not_ref_stcb = 1; sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn; /* * If there is no end, there never will be * now. */ if (sq->end_added == 0) { /* Held for PD-API, clear that. */ sq->pdapi_aborted = 1; sq->held_length = 0; if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, stcb, SCTP_PARTIAL_DELIVERY_ABORTED, (void *)sq, SCTP_SO_LOCKED); } /* Add an end to wake them */ sq->end_added = 1; } } } SCTP_INP_READ_UNLOCK(inp); if (stcb->block_entry) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET); stcb->block_entry->error = ECONNRESET; stcb->block_entry = NULL; } } if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) { /* * Someone holds a reference OR the socket is unaccepted * yet. */ if ((stcb->asoc.refcnt) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if (so) { /* Wake any reader/writers */ sctp_sorwakeup(inp, so); sctp_sowwakeup(inp, so); } SCTP_TCB_UNLOCK(stcb); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 9); #endif /* no asoc destroyed */ return (0); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 10); #endif /* * When I reach here, no others want to kill the assoc yet.. and I * own the lock. Now its possible an abort comes in when I do the * lock exchange below to grab all the locks to do the final take * out. to prevent this we increment the count, which will start a * timer and blow out above thus assuring us that we hold exclusive * killing of the asoc. Note that after getting back the TCB lock we * will go ahead and increment the counter back up and stop any * timer a passing stranger may have started :-S */ if (from_inpcbfree == SCTP_NORMAL_PROC) { atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); SCTP_TCB_LOCK(stcb); } /* Double check the GONE flag */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* * For TCP type we need special handling when we are * connected. We also include the peel'ed off ones to. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED; inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED; if (so) { SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONNECTED); so->so_state |= SS_ISDISCONNECTED; socantrcvmore_locked(so); socantsendmore(so); sctp_sowwakeup(inp, so); sctp_sorwakeup(inp, so); SCTP_SOWAKEUP(so); } } } /* * Make it invalid too, that way if its about to run it will abort * and return. */ /* re-increment the lock */ if (from_inpcbfree == SCTP_NORMAL_PROC) { atomic_subtract_int(&stcb->asoc.refcnt, 1); } if (stcb->asoc.refcnt) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_WUNLOCK(inp); } SCTP_TCB_UNLOCK(stcb); return (0); } asoc->state = 0; if (inp->sctp_tcbhash) { LIST_REMOVE(stcb, sctp_tcbhash); } if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } if (inp->sctp_socket == NULL) { stcb->sctp_socket = NULL; } /* Now lets remove it from the list of ALL associations in the EP */ LIST_REMOVE(stcb, sctp_tcblist); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); } /* pull from vtag hash */ LIST_REMOVE(stcb, sctp_asocs); sctp_add_vtag_to_timewait(asoc->my_vtag, inp->sctp_lport, stcb->rport); /* * Now restop the timers to be sure this is paranoia at is finest! */ sctp_stop_association_timers(stcb, true); /* * The chunk lists and such SHOULD be empty but we check them just * in case. */ /* anything on the wheel needs to be removed */ for (i = 0; i < asoc->streamoutcnt; i++) { struct sctp_stream_out *outs; outs = &asoc->strmout[i]; /* now clean up any chunks here */ TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { if (so) { /* Still an open socket - report */ sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, (void *)sp, SCTP_SO_LOCKED); } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; sp->length = 0; } } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED); } } /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) { TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp); SCTP_FREE(strrst, SCTP_M_STRESET); } TAILQ_FOREACH_SAFE(sq, &asoc->pending_reply_queue, next, nsq) { TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next); if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } sctp_free_remote_addr(sq->whoFrom); sq->whoFrom = NULL; sq->stcb = NULL; /* Free the ctl entry */ sctp_free_a_readq(stcb, sq); /* sa_ignore FREED_MEMORY */ } TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) { TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); asoc->free_chunk_cnt--; /* sa_ignore FREED_MEMORY */ } /* pending send queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* sent queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) { if (chk->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } } TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } #ifdef INVARIANTS for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].chunks_on_queues > 0) { panic("%u chunks left for stream %u.", stcb->asoc.strmout[i].chunks_on_queues, i); } } #endif /* control queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* ASCONF queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } /* the stream outs */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } asoc->strm_realoutsize = asoc->streamoutcnt = 0; if (asoc->strmin) { for (i = 0; i < asoc->streamincnt; i++) { sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); asoc->strmin = NULL; } asoc->streamincnt = 0; TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { #ifdef INVARIANTS if (SCTP_BASE_INFO(ipi_count_raddr) == 0) { panic("no net's left alloc'ed, or list points to itself"); } #endif TAILQ_REMOVE(&asoc->nets, net, sctp_next); sctp_free_remote_addr(net); } LIST_FOREACH_SAFE(laddr, &asoc->sctp_restricted_addrs, sctp_nxt_addr, naddr) { /* sa_ignore FREED_MEMORY */ sctp_remove_laddr(laddr); } /* pending asconf (address) parameters */ TAILQ_FOREACH_SAFE(aparam, &asoc->asconf_queue, next, naparam) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_queue, aparam, next); SCTP_FREE(aparam, SCTP_M_ASC_ADDR); } TAILQ_FOREACH_SAFE(aack, &asoc->asconf_ack_sent, next, naack) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next); if (aack->data != NULL) { sctp_m_freem(aack->data); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack); } /* clean up auth stuff */ if (asoc->local_hmacs) sctp_free_hmaclist(asoc->local_hmacs); if (asoc->peer_hmacs) sctp_free_hmaclist(asoc->peer_hmacs); if (asoc->local_auth_chunks) sctp_free_chunklist(asoc->local_auth_chunks); if (asoc->peer_auth_chunks) sctp_free_chunklist(asoc->peer_auth_chunks); sctp_free_authinfo(&asoc->authinfo); LIST_FOREACH_SAFE(shared_key, &asoc->shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* Insert new items here :> */ /* Get rid of LOCK */ SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_RLOCK(inp); } #ifdef SCTP_TRACK_FREED_ASOCS if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* now clean up the tasoc itself */ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); } else { LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist); } #else SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); #endif if (from_inpcbfree == SCTP_NORMAL_PROC) { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * If its NOT the inp_free calling us AND sctp_close * as been called, we call back... */ SCTP_INP_RUNLOCK(inp); /* * This will start the kill timer (if we are the * last one) since we hold an increment yet. But * this is the only safe way to do this since * otherwise if the socket closes at the same time * we are here we might collide in the cleanup. */ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, SCTP_CALLED_DIRECTLY_NOCMPSET); SCTP_INP_DECR_REF(inp); } else { /* The socket is still open. */ SCTP_INP_DECR_REF(inp); SCTP_INP_RUNLOCK(inp); } } /* destroyed the asoc */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 11); #endif return (1); } /* * determine if a destination is "reachable" based upon the addresses bound * to the current endpoint (e.g. only v4 or v6 currently bound) */ /* * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use * assoc level v4/v6 flags, as the assoc *may* not have the same address * types bound as its endpoint */ int sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr) { struct sctp_inpcb *inp; int answer; /* * No locks here, the TCB, in all cases is already locked and an * assoc is up. There is either a INP lock by the caller applied (in * asconf case when deleting an address) or NOT in the HB case, * however if HB then the INP increment is up and the INP will not * be removed (on top of the fact that we have a TCB lock). So we * only want to read the sctp_flags, which is either bound-all or * not.. no protection needed since once an assoc is up you can't be * changing your binding. */ inp = stcb->sctp_ep; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* if bound all, destination is not restricted */ /* * RRS: Question during lock work: Is this correct? If you * are bound-all you still might need to obey the V4--V6 * flags??? IMO this bound-all stuff needs to be removed! */ return (1); } /* NOTE: all "scope" checks are done when local addresses are added */ switch (destaddr->sa_family) { #ifdef INET6 case AF_INET6: answer = inp->ip_inp.inp.inp_vflag & INP_IPV6; break; #endif #ifdef INET case AF_INET: answer = inp->ip_inp.inp.inp_vflag & INP_IPV4; break; #endif default: /* invalid family, so it's unreachable */ answer = 0; break; } return (answer); } /* * update the inp_vflags on an endpoint */ static void sctp_update_ep_vflag(struct sctp_inpcb *inp) { struct sctp_laddr *laddr; /* first clear the flag */ inp->ip_inp.inp.inp_vflag = 0; /* set the flag based on addresses on the ep list */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } switch (laddr->ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } } } /* * Add the address to the endpoint local address list There is nothing to be * done if we are bound to all addresses */ void sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) { struct sctp_laddr *laddr; struct sctp_tcb *stcb; int fnd, error = 0; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-useable addr. */ return; } } #endif /* first, is it already present? */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd == 0) { /* Not in the ep list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action); if (error != 0) return; inp->laddr_count++; /* update inp_vflag flags */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { sctp_add_local_addr_restricted(stcb, ifa); } } return; } /* * select a new (hopefully reachable) destination net (should only be used * when we deleted an ep addr that is the only usable source address to reach * the destination net) */ static void sctp_select_primary_destination(struct sctp_tcb *stcb) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* for now, we'll just pick the first reachable one we find */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) continue; if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr)) { /* found a reachable destination */ stcb->asoc.primary_destination = net; } } /* I can't there from here! ...we're gonna die shortly... */ } /* * Delete the address from the endpoint local address list. There is nothing * to be done if we are bound to all addresses */ void sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; int fnd; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd && (inp->laddr_count < 2)) { /* can't delete unless there are at LEAST 2 addresses */ return; } if (fnd) { /* * clean up any use of this address go through our * associations and clear any last_used_address that match * this one for each assoc, see if a new primary_destination * is needed */ struct sctp_tcb *stcb; /* clean up "next_addr_touse" */ if (inp->next_addr_touse == laddr) /* delete this address */ inp->next_addr_touse = NULL; /* clean up "last_used_address" */ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { struct sctp_nets *net; SCTP_TCB_LOCK(stcb); if (stcb->asoc.last_used_address == laddr) /* delete this address */ stcb->asoc.last_used_address = NULL; /* * Now spin through all the nets and purge any ref * to laddr */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._s_addr == laddr->ifa) { /* Yep, purge src address selected */ RO_NHFREE(&net->ro); sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } } SCTP_TCB_UNLOCK(stcb); } /* for each tcb */ /* remove it from the ep list */ sctp_remove_laddr(laddr); inp->laddr_count--; /* update inp_vflag flags */ sctp_update_ep_vflag(inp); } return; } /* * Add the address to the TCB local address restricted list. * This is a "pending" address list (eg. addresses waiting for an * ASCONF-ACK response) and cannot be used as a valid source address. */ void sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; struct sctpladdr *list; /* * Assumes TCB is locked.. and possibly the INP. May need to * confirm/fix that if we need it and is not the case. */ list = &stcb->asoc.sctp_restricted_addrs; #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ return; } } #endif /* does the address already exist? */ LIST_FOREACH(laddr, list, sctp_nxt_addr) { if (laddr->ifa == ifa) { return; } } /* add to the list */ (void)sctp_insert_laddr(list, ifa, 0); return; } /* * Remove a local address from the TCB local address restricted list */ void sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; /* * This is called by asconf work. It is assumed that a) The TCB is * locked and b) The INP is locked. This is true in as much as I can * trace through the entry asconf code where I did these locks. * Again, the ASCONF code is a bit different in that it does lock * the INP during its work often times. This must be since we don't * want other proc's looking up things while what they are looking * up is changing :-D */ inp = stcb->sctp_ep; /* if subset bound and don't allow ASCONF's, can't delete last */ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { if (stcb->sctp_ep->laddr_count < 2) { /* can't delete last address */ return; } } LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { /* remove the address if it exists */ if (laddr->ifa == NULL) continue; if (laddr->ifa == ifa) { sctp_remove_laddr(laddr); return; } } /* address not found! */ return; } /* sysctl */ static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC; static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR; #if defined(SCTP_MCORE_INPUT) && defined(SMP) struct sctp_mcore_ctrl *sctp_mcore_workers = NULL; int *sctp_cpuarry = NULL; void sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use) { /* Queue a packet to a processor for the specified core */ struct sctp_mcore_queue *qent; struct sctp_mcore_ctrl *wkq; int need_wake = 0; if (sctp_mcore_workers == NULL) { /* Something went way bad during setup */ sctp_input_with_port(m, off, 0); return; } SCTP_MALLOC(qent, struct sctp_mcore_queue *, (sizeof(struct sctp_mcore_queue)), SCTP_M_MCORE); if (qent == NULL) { /* This is trouble */ sctp_input_with_port(m, off, 0); return; } qent->vn = curvnet; qent->m = m; qent->off = off; qent->v6 = 0; wkq = &sctp_mcore_workers[cpu_to_use]; SCTP_MCORE_QLOCK(wkq); TAILQ_INSERT_TAIL(&wkq->que, qent, next); if (wkq->running == 0) { need_wake = 1; } SCTP_MCORE_QUNLOCK(wkq); if (need_wake) { wakeup(&wkq->running); } } static void sctp_mcore_thread(void *arg) { struct sctp_mcore_ctrl *wkq; struct sctp_mcore_queue *qent; wkq = (struct sctp_mcore_ctrl *)arg; struct mbuf *m; int off, v6; /* Wait for first tickle */ SCTP_MCORE_LOCK(wkq); wkq->running = 0; msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); SCTP_MCORE_UNLOCK(wkq); /* Bind to our cpu */ thread_lock(curthread); sched_bind(curthread, wkq->cpuid); thread_unlock(curthread); /* Now lets start working */ SCTP_MCORE_LOCK(wkq); /* Now grab lock and go */ for (;;) { SCTP_MCORE_QLOCK(wkq); skip_sleep: wkq->running = 1; qent = TAILQ_FIRST(&wkq->que); if (qent) { TAILQ_REMOVE(&wkq->que, qent, next); SCTP_MCORE_QUNLOCK(wkq); CURVNET_SET(qent->vn); m = qent->m; off = qent->off; v6 = qent->v6; SCTP_FREE(qent, SCTP_M_MCORE); if (v6 == 0) { sctp_input_with_port(m, off, 0); } else { SCTP_PRINTF("V6 not yet supported\n"); sctp_m_freem(m); } CURVNET_RESTORE(); SCTP_MCORE_QLOCK(wkq); } wkq->running = 0; if (!TAILQ_EMPTY(&wkq->que)) { goto skip_sleep; } SCTP_MCORE_QUNLOCK(wkq); msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); } } static void sctp_startup_mcore_threads(void) { int i, cpu; if (mp_ncpus == 1) return; if (sctp_mcore_workers != NULL) { /* * Already been here in some previous vnet? */ return; } SCTP_MALLOC(sctp_mcore_workers, struct sctp_mcore_ctrl *, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl)), SCTP_M_MCORE); if (sctp_mcore_workers == NULL) { /* TSNH I hope */ return; } memset(sctp_mcore_workers, 0, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl))); /* Init the structures */ for (i = 0; i <= mp_maxid; i++) { TAILQ_INIT(&sctp_mcore_workers[i].que); SCTP_MCORE_LOCK_INIT(&sctp_mcore_workers[i]); SCTP_MCORE_QLOCK_INIT(&sctp_mcore_workers[i]); sctp_mcore_workers[i].cpuid = i; } if (sctp_cpuarry == NULL) { SCTP_MALLOC(sctp_cpuarry, int *, (mp_ncpus * sizeof(int)), SCTP_M_MCORE); i = 0; CPU_FOREACH(cpu) { sctp_cpuarry[i] = cpu; i++; } } /* Now start them all */ CPU_FOREACH(cpu) { (void)kproc_create(sctp_mcore_thread, (void *)&sctp_mcore_workers[cpu], &sctp_mcore_workers[cpu].thread_proc, 0, SCTP_KTHREAD_PAGES, SCTP_MCORE_NAME); } } #endif #define VALIDATE_LOADER_TUNABLE(var_name, prefix) \ if (SCTP_BASE_SYSCTL(var_name) < prefix##_MIN || \ SCTP_BASE_SYSCTL(var_name) > prefix##_MAX) \ SCTP_BASE_SYSCTL(var_name) = prefix##_DEFAULT void sctp_pcb_init(void) { /* * SCTP initialization for the PCB structures should be called by * the sctp_init() function. */ int i; struct timeval tv; if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) { /* error I was called twice */ return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 1; #if defined(SCTP_LOCAL_TRACE_BUF) memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); #endif #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_MALLOC(SCTP_BASE_STATS, struct sctpstat *, ((mp_maxid + 1) * sizeof(struct sctpstat)), SCTP_M_MCORE); #endif (void)SCTP_GETTIME_TIMEVAL(&tv); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) memset(SCTP_BASE_STATS, 0, sizeof(struct sctpstat) * (mp_maxid + 1)); SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t)tv.tv_usec; #else memset(&SCTP_BASE_STATS, 0, sizeof(struct sctpstat)); SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t)tv.tv_usec; #endif /* init the empty list of (All) Endpoints */ LIST_INIT(&SCTP_BASE_INFO(listhead)); /* init the hash table of endpoints */ TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale)); VALIDATE_LOADER_TUNABLE(sctp_hashtblsize, SCTPCTL_TCBHASHSIZE); VALIDATE_LOADER_TUNABLE(sctp_pcbtblsize, SCTPCTL_PCBHASHSIZE); VALIDATE_LOADER_TUNABLE(sctp_chunkscale, SCTPCTL_CHUNKSCALE); SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31), &SCTP_BASE_INFO(hashasocmark)); SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashmark)); SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashtcpmark)); SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize); SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH, &SCTP_BASE_INFO(hashvrfmark)); SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE, &SCTP_BASE_INFO(vrf_ifn_hashmark)); /* init the zones */ /* * FIX ME: Should check for NULL returns, but if it does fail we are * doomed to panic anyways... add later maybe. */ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep", sizeof(struct sctp_inpcb), maxsockets); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc", sizeof(struct sctp_tcb), sctp_max_number_of_assoc); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr", sizeof(struct sctp_laddr), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr", sizeof(struct sctp_nets), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk", sizeof(struct sctp_tmit_chunk), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq", sizeof(struct sctp_queued_to_read), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out", sizeof(struct sctp_stream_queue_pending), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf", sizeof(struct sctp_asconf), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack", sizeof(struct sctp_asconf_ack), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); /* Master Lock INIT for info structure */ SCTP_INP_INFO_LOCK_INIT(); SCTP_STATLOG_INIT_LOCK(); SCTP_IPI_COUNT_INIT(); SCTP_IPI_ADDR_INIT(); #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_INIT(); #endif LIST_INIT(&SCTP_BASE_INFO(addr_wq)); SCTP_WQ_ADDR_INIT(); /* not sure if we need all the counts */ SCTP_BASE_INFO(ipi_count_ep) = 0; /* assoc/tcb zone info */ SCTP_BASE_INFO(ipi_count_asoc) = 0; /* local addrlist zone info */ SCTP_BASE_INFO(ipi_count_laddr) = 0; /* remote addrlist zone info */ SCTP_BASE_INFO(ipi_count_raddr) = 0; /* chunk info */ SCTP_BASE_INFO(ipi_count_chunk) = 0; /* socket queue zone info */ SCTP_BASE_INFO(ipi_count_readq) = 0; /* stream out queue cont */ SCTP_BASE_INFO(ipi_count_strmoq) = 0; SCTP_BASE_INFO(ipi_free_strmoq) = 0; SCTP_BASE_INFO(ipi_free_chunks) = 0; SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer)); /* Init the TIMEWAIT list */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]); } sctp_startup_iterator(); #if defined(SCTP_MCORE_INPUT) && defined(SMP) sctp_startup_mcore_threads(); #endif /* * INIT the default VRF which for BSD is the only one, other O/S's * may have more. But initially they must start with one and then * add the VRF's as addresses are added. */ sctp_init_vrf_list(SCTP_DEFAULT_VRF); } /* * Assumes that the SCTP_BASE_INFO() lock is NOT held. */ void sctp_pcb_finish(void) { struct sctp_vrflist *vrf_bucket; struct sctp_vrf *vrf, *nvrf; struct sctp_ifn *ifn, *nifn; struct sctp_ifa *ifa, *nifa; struct sctpvtaghead *chain; struct sctp_tagblock *twait_block, *prev_twait_block; struct sctp_laddr *wi, *nwi; int i; struct sctp_iterator *it, *nit; if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_PRINTF("%s: race condition on teardown.\n", __func__); return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 0; /* * In FreeBSD the iterator thread never exits but we do clean up. * The only way FreeBSD reaches here is if we have VRF's but we * still add the ifdef to make it compile on old versions. */ retry: SCTP_IPI_ITERATOR_WQ_LOCK(); /* * sctp_iterator_worker() might be working on an it entry without * holding the lock. We won't find it on the list either and * continue and free/destroy it. While holding the lock, spin, to * avoid the race condition as sctp_iterator_worker() will have to * wait to re-acquire the lock. */ if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) { SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. " "cur_it=%p\n", __func__, sctp_it_ctl.cur_it); DELAY(10); goto retry; } TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_ITERATOR_LOCK(); if ((sctp_it_ctl.cur_it) && (sctp_it_ctl.cur_it->vn == curvnet)) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } SCTP_ITERATOR_UNLOCK(); SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer)); SCTP_WQ_ADDR_LOCK(); LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) { LIST_REMOVE(wi, sctp_nxt_addr); SCTP_DECR_LADDR_COUNT(); if (wi->action == SCTP_DEL_IP_ADDRESS) { SCTP_FREE(wi->ifa, SCTP_M_IFA); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi); } SCTP_WQ_ADDR_UNLOCK(); /* * free the vrf/ifn/ifa lists and hashes (be sure address monitor is * destroyed first). */ SCTP_IPI_ADDR_WLOCK(); vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH_SAFE(vrf, vrf_bucket, next_vrf, nvrf) { LIST_FOREACH_SAFE(ifn, &vrf->ifnlist, next_ifn, nifn) { LIST_FOREACH_SAFE(ifa, &ifn->ifalist, next_ifa, nifa) { /* free the ifa */ LIST_REMOVE(ifa, next_bucket); LIST_REMOVE(ifa, next_ifa); SCTP_FREE(ifa, SCTP_M_IFA); } /* free the ifn */ LIST_REMOVE(ifn, next_bucket); LIST_REMOVE(ifn, next_ifn); SCTP_FREE(ifn, SCTP_M_IFN); } SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); /* free the vrf */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); } SCTP_IPI_ADDR_WUNLOCK(); /* free the vrf hashes */ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark)); SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark)); /* * free the TIMEWAIT list elements malloc'd in the function * sctp_add_vtag_to_timewait()... */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { chain = &SCTP_BASE_INFO(vtag_timewait)[i]; if (!LIST_EMPTY(chain)) { prev_twait_block = NULL; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { if (prev_twait_block) { SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } prev_twait_block = twait_block; } SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } } /* free the locks and mutexes */ #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_DESTROY(); #endif SCTP_IPI_ADDR_DESTROY(); SCTP_STATLOG_DESTROY(); SCTP_INP_INFO_LOCK_DESTROY(); SCTP_WQ_ADDR_DESTROY(); /* Get rid of other stuff too. */ if (SCTP_BASE_INFO(sctp_asochash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); if (SCTP_BASE_INFO(sctp_ephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE); #endif } int sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, int offset, int limit, struct sockaddr *src, struct sockaddr *dst, struct sockaddr *altsa, uint16_t port) { /* * grub through the INIT pulling addresses and loading them to the * nets structure in the asoc. The from address in the mbuf should * also be loaded (if it is not already). This routine can be called * with either INIT or INIT-ACK's as long as the m points to the IP * packet and the offset points to the beginning of the parameters. */ struct sctp_inpcb *inp; struct sctp_nets *net, *nnet, *net_tmp; struct sctp_paramhdr *phdr, param_buf; struct sctp_tcb *stcb_tmp; uint16_t ptype, plen; struct sockaddr *sa; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t peer_supports_ecn; uint8_t peer_supports_prsctp; uint8_t peer_supports_auth; uint8_t peer_supports_asconf; uint8_t peer_supports_asconf_ack; uint8_t peer_supports_reconfig; uint8_t peer_supports_nrsack; uint8_t peer_supports_pktdrop; uint8_t peer_supports_idata; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* First get the destination address setup too. */ #ifdef INET memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = stcb->rport; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; #endif if (altsa) { sa = altsa; } else { sa = src; } peer_supports_idata = 0; peer_supports_ecn = 0; peer_supports_prsctp = 0; peer_supports_auth = 0; peer_supports_asconf = 0; peer_supports_asconf_ack = 0; peer_supports_reconfig = 0; peer_supports_nrsack = 0; peer_supports_pktdrop = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* mark all addresses that we have currently on the list */ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; } /* does the source address already exist? if so skip it */ inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* no scope set here since we have a tcb already. */ switch (sa->sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { return (-1); } } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { return (-2); } } break; #endif default: break; } } else { if (net_tmp != NULL && stcb_tmp == stcb) { net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } else if (stcb_tmp != stcb) { /* It belongs to another association? */ if (stcb_tmp) SCTP_TCB_UNLOCK(stcb_tmp); return (-3); } } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-4); } /* now we must go through each of the params. */ phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); /* * SCTP_PRINTF("ptype => %0x, plen => %d\n", * (uint32_t)ptype, (int)plen); */ if (offset + plen > limit) { break; } if (plen < sizeof(struct sctp_paramhdr)) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS) { if (stcb->asoc.scope.ipv4_addr_legal) { struct sctp_ipv4addr_param *p4, p4_buf; /* ok get the v4 address and check/add */ phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (plen != sizeof(struct sctp_ipv4addr_param) || phdr == NULL) { return (-5); } p4 = (struct sctp_ipv4addr_param *)phdr; sin.sin_addr.s_addr = p4->addr; if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { /* Skip multi-cast addresses */ goto next_param; } if (in_broadcast(sin.sin_addr)) { goto next_param; } sa = (struct sockaddr *)&sin; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* * no scope set since we have a tcb * already */ /* * we must validate the state again * here */ add_it_now: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-7); } if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { return (-8); } } else if (stcb_tmp == stcb) { if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-10); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, false, SCTP_SO_NOT_LOCKED); goto add_it_now; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-12); } return (-13); } } } else #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS) { if (stcb->asoc.scope.ipv6_addr_legal) { /* ok get the v6 address and check/add */ struct sctp_ipv6addr_param *p6, p6_buf; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (plen != sizeof(struct sctp_ipv6addr_param) || phdr == NULL) { return (-14); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { /* Skip multi-cast addresses */ goto next_param; } if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * Link local make no sense without * scope */ goto next_param; } sa = (struct sockaddr *)&sin6; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb_tmp == NULL && (inp == stcb->sctp_ep || inp == NULL)) { /* * we must validate the state again * here */ add_it_now6: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-16); } /* * we must add the address, no scope * set */ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { return (-17); } } else if (stcb_tmp == stcb) { /* * we must validate the state again * here */ if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-19); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, false, SCTP_SO_NOT_LOCKED); goto add_it_now6; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-21); } return (-22); } } } else #endif if (ptype == SCTP_ECN_CAPABLE) { peer_supports_ecn = 1; } else if (ptype == SCTP_ULP_ADAPTATION) { if (stcb->asoc.state != SCTP_STATE_OPEN) { struct sctp_adaptation_layer_indication ai, *aip; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ai, sizeof(ai)); aip = (struct sctp_adaptation_layer_indication *)phdr; if (aip) { stcb->asoc.peers_adaptation = ntohl(aip->indication); stcb->asoc.adaptation_needed = 1; } } } else if (ptype == SCTP_SET_PRIM_ADDR) { struct sctp_asconf_addr_param lstore, *fee; int lptype; struct sockaddr *lsa = NULL; #ifdef INET struct sctp_asconf_addrv4_param *fii; #endif if (stcb->asoc.asconf_supported == 0) { return (-100); } if (plen > sizeof(lstore)) { return (-23); } if (plen < sizeof(struct sctp_asconf_addrv4_param)) { return (-101); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&lstore, plen); if (phdr == NULL) { return (-24); } fee = (struct sctp_asconf_addr_param *)phdr; lptype = ntohs(fee->addrp.ph.param_type); switch (lptype) { #ifdef INET case SCTP_IPV4_ADDRESS: if (plen != sizeof(struct sctp_asconf_addrv4_param)) { SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addrv4_param), plen); } else { fii = (struct sctp_asconf_addrv4_param *)fee; sin.sin_addr.s_addr = fii->addrp.addr; lsa = (struct sockaddr *)&sin; } break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (plen != sizeof(struct sctp_asconf_addr_param)) { SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addr_param), plen); } else { memcpy(sin6.sin6_addr.s6_addr, fee->addrp.addr, sizeof(fee->addrp.addr)); lsa = (struct sockaddr *)&sin6; } break; #endif default: break; } if (lsa) { (void)sctp_set_primary_addr(stcb, sa, NULL); } } else if (ptype == SCTP_HAS_NAT_SUPPORT) { stcb->asoc.peer_supports_nat = 1; } else if (ptype == SCTP_PRSCTP_SUPPORTED) { /* Peer supports pr-sctp */ peer_supports_prsctp = 1; } else if (ptype == SCTP_ZERO_CHECKSUM_ACCEPTABLE) { struct sctp_zero_checksum_acceptable zero_chksum, *zero_chksum_p; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&zero_chksum, sizeof(struct sctp_zero_checksum_acceptable)); if (phdr != NULL) { /* * Only send zero checksums if the upper * layer has enabled the support for the * same method as allowed by the peer. */ zero_chksum_p = (struct sctp_zero_checksum_acceptable *)phdr; if ((ntohl(zero_chksum_p->edmid) != SCTP_EDMID_NONE) && (ntohl(zero_chksum_p->edmid) == stcb->asoc.rcv_edmid)) { stcb->asoc.snd_edmid = stcb->asoc.rcv_edmid; } } } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; int num_ent, i; if (plen > sizeof(local_store)) { return (-35); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, plen); if (phdr == NULL) { return (-25); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: peer_supports_asconf = 1; break; case SCTP_ASCONF_ACK: peer_supports_asconf_ack = 1; break; case SCTP_FORWARD_CUM_TSN: peer_supports_prsctp = 1; break; case SCTP_PACKET_DROPPED: peer_supports_pktdrop = 1; break; case SCTP_NR_SELECTIVE_ACK: peer_supports_nrsack = 1; break; case SCTP_STREAM_RESET: peer_supports_reconfig = 1; break; case SCTP_AUTHENTICATION: peer_supports_auth = 1; break; case SCTP_IDATA: peer_supports_idata = 1; break; default: /* one I have not learned yet */ break; } } } else if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; if (got_random) { /* already processed a RANDOM */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)random_store, plen); if (phdr == NULL) return (-26); p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); /* enforce the random length */ if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-27); } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; uint16_t i; if (plen > sizeof(hmacs_store)) break; if (got_hmacs) { /* already processed a HMAC list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, plen); if (phdr == NULL) return (-28); hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { return (-29); } if (stcb->asoc.peer_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.peer_hmacs); stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.peer_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs, ntohs(hmacs->hmac_ids[i])); } } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; if (got_chklist) { /* already processed a Chunks list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, plen); if (phdr == NULL) return (-30); chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); if (stcb->asoc.peer_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.peer_auth_chunks); else stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.peer_auth_chunks); /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } got_chklist = 1; } else if ((ptype == SCTP_HEARTBEAT_INFO) || (ptype == SCTP_STATE_COOKIE) || (ptype == SCTP_UNRECOG_PARAM) || (ptype == SCTP_COOKIE_PRESERVE) || (ptype == SCTP_SUPPORTED_ADDRTYPE) || (ptype == SCTP_ADD_IP_ADDRESS) || (ptype == SCTP_DEL_IP_ADDRESS) || (ptype == SCTP_ERROR_CAUSE_IND) || (ptype == SCTP_SUCCESS_REPORT)) { /* don't care */ } else { if ((ptype & 0x8000) == 0x0000) { /* * must stop processing the rest of the * param's. Any report bits were handled * with the call to * sctp_arethere_unrecognized_parameters() * when the INIT or INIT-ACK was first seen. */ break; } } next_param: offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) == SCTP_ADDR_NOT_IN_ASSOC) { /* This address has been removed from the asoc */ /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); if (net == stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } sctp_free_remote_addr(net); } } if ((stcb->asoc.ecn_supported == 1) && (peer_supports_ecn == 0)) { stcb->asoc.ecn_supported = 0; } if ((stcb->asoc.prsctp_supported == 1) && (peer_supports_prsctp == 0)) { stcb->asoc.prsctp_supported = 0; } if ((stcb->asoc.auth_supported == 1) && ((peer_supports_auth == 0) || (got_random == 0) || (got_hmacs == 0))) { stcb->asoc.auth_supported = 0; } if ((stcb->asoc.asconf_supported == 1) && ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) || (stcb->asoc.auth_supported == 0) || (saw_asconf == 0) || (saw_asconf_ack == 0))) { stcb->asoc.asconf_supported = 0; } if ((stcb->asoc.reconfig_supported == 1) && (peer_supports_reconfig == 0)) { stcb->asoc.reconfig_supported = 0; } if ((stcb->asoc.idata_supported == 1) && (peer_supports_idata == 0)) { stcb->asoc.idata_supported = 0; } if ((stcb->asoc.nrsack_supported == 1) && (peer_supports_nrsack == 0)) { stcb->asoc.nrsack_supported = 0; } if ((stcb->asoc.pktdrop_supported == 1) && (peer_supports_pktdrop == 0)) { stcb->asoc.pktdrop_supported = 0; } /* validate authentication required parameters */ if ((peer_supports_auth == 0) && (got_chklist == 1)) { /* peer does not support auth but sent a chunks list? */ return (-31); } if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) { /* peer supports asconf but not auth? */ return (-32); } else if ((peer_supports_asconf == 1) && (peer_supports_auth == 1) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-33); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); } else { keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { memcpy(new_key->key + keylen, chunks, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { memcpy(new_key->key + keylen, hmacs, sizeof(*hmacs) + hmacs_len); } } else { /* failed to get memory for the key */ return (-34); } if (stcb->asoc.authinfo.peer_random != NULL) sctp_free_key(stcb->asoc.authinfo.peer_random); stcb->asoc.authinfo.peer_random = new_key; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); return (0); } int sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa, struct sctp_nets *net) { /* make sure the requested primary address exists in the assoc */ if (net == NULL && sa) net = sctp_findnet(stcb, sa); if (net == NULL) { /* didn't find the requested primary address! */ return (-1); } else { /* set the primary address */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* Must be confirmed, so queue to set */ net->dest_state |= SCTP_ADDR_REQ_PRIMARY; return (0); } stcb->asoc.primary_destination = net; if (((net->dest_state & SCTP_ADDR_PF) == 0) && (stcb->asoc.alternate != NULL)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } net = TAILQ_FIRST(&stcb->asoc.nets); if (net != stcb->asoc.primary_destination) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficient if the * primary is the first on the list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } } bool sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now) { struct sctpasochead *head; struct sctp_tcb *stcb; SCTP_INP_INFO_LOCK_ASSERT(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { /* * We choose not to lock anything here. TCB's can't be * removed since we have the read lock, so they can't be * freed on us, same thing for the INP. I may be wrong with * this assumption, but we will go with it for now :-) */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (stcb->asoc.my_vtag == tag) { /* candidate */ if (stcb->rport != rport) { continue; } if (stcb->sctp_ep->sctp_lport != lport) { continue; } /* The tag is currently used, so don't use it. */ return (false); } } return (!sctp_is_in_timewait(tag, lport, rport, now->tv_sec)); } static void sctp_drain_mbufs(struct sctp_tcb *stcb) { /* * We must hunt this association for MBUF's past the cumack (i.e. * out of order data that we can renege on). */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; uint32_t cumulative_tsn_p1; struct sctp_queued_to_read *control, *ncontrol; int cnt, strmat; uint32_t gap, i; int fnd = 0; /* We look for anything larger than the cum-ack + 1 */ asoc = &stcb->asoc; if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) { /* none we can reneg on. */ return; } SCTP_STAT_INCR(sctps_protocol_drains_done); cumulative_tsn_p1 = asoc->cumulative_tsn + 1; cnt = 0; /* Ok that was fun, now we will drain all the inbound streams? */ for (strmat = 0; strmat < asoc->streamincnt; strmat++) { TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_ORDERED) { panic("Huh control: %p on_q: %d -- not ordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].uno_inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_UNORDERED) { panic("Huh control: %p on_q: %d -- not unordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } } if (cnt) { /* We must back down to see what the new highest is */ for (i = asoc->highest_tsn_inside_map; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) { SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { asoc->highest_tsn_inside_map = i; fnd = 1; break; } } if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } /* * Question, should we go through the delivery queue? The * only reason things are on here is the app not reading OR * a p-d-api up. An attacker COULD send enough in to * initiate the PD-API and then send a bunch of stuff to * other streams... these would wind up on the delivery * queue.. and then we would not get to them. But in order * to do this I then have to back-track and un-deliver * sequence numbers in streams.. el-yucko. I think for now * we will NOT look at the delivery queue and leave it to be * something to consider later. An alternative would be to * abort the P-D-API with a notification and then deliver * the data.... Or another method might be to keep track of * how many times the situation occurs and if we see a * possible attack underway just abort the association. */ #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt); #endif /* * Now do we need to find a new * asoc->highest_tsn_inside_map? */ asoc->last_revoke_count = cnt; sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_11); /* sa_ignore NO_NULL_CHK */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED); } /* * Another issue, in un-setting the TSN's in the mapping array we * DID NOT adjust the highest_tsn marker. This will cause one of * two things to occur. It may cause us to do extra work in checking * for our mapping array movement. More importantly it may cause us * to SACK every datagram. This may not be a bad thing though since * we will recover once we get our cum-ack above and all this stuff * we dumped recovered. */ } static void -sctp_drain(void) +sctp_drain(void *arg __unused, int flags __unused) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); NET_EPOCH_ENTER(et); /* * We must walk the PCB lists for ALL associations here. The system * is LOW on MBUF's and needs help. This is where reneging will * occur. We really hope this does NOT happen! */ VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct sctp_inpcb *inp; struct sctp_tcb *stcb; SCTP_STAT_INCR(sctps_protocol_drain_calls); if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { #ifdef VIMAGE continue; #else NET_EPOCH_EXIT(et); return; #endif } SCTP_INP_INFO_RLOCK(); LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { /* For each endpoint */ SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { /* For each association */ SCTP_TCB_LOCK(stcb); sctp_drain_mbufs(stcb); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } SCTP_INP_INFO_RUNLOCK(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } EVENTHANDLER_DEFINE(vm_lowmem, sctp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_DEFINE(mbuf_lowmem, sctp_drain, NULL, LOWMEM_PRI_DEFAULT); /* * start a new iterator * iterates through all endpoints and associations based on the pcb_state * flags and asoc_state. "af" (mandatory) is executed for all matching * assocs and "ef" (optional) is executed when the iterator completes. * "inpf" (optional) is executed for each new endpoint as it is being * iterated through. inpe (optional) is called when the inp completes * its way through all the stcbs. */ int sctp_initiate_iterator(inp_func inpf, asoc_func af, inp_func inpe, uint32_t pcb_state, uint32_t pcb_features, uint32_t asoc_state, void *argp, uint32_t argi, end_func ef, struct sctp_inpcb *s_inp, uint8_t chunk_output_off) { struct sctp_iterator *it = NULL; if (af == NULL) { return (-1); } if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_PRINTF("%s: abort on initialize being %d\n", __func__, SCTP_BASE_VAR(sctp_pcb_initialized)); return (-1); } SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator), SCTP_M_ITER); if (it == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); return (-1); } memset(it, 0, sizeof(*it)); it->function_assoc = af; it->function_inp = inpf; if (inpf) it->done_current_ep = 0; else it->done_current_ep = 1; it->function_atend = ef; it->pointer = argp; it->val = argi; it->pcb_flags = pcb_state; it->pcb_features = pcb_features; it->asoc_state = asoc_state; it->function_inp_end = inpe; it->no_chunk_output = chunk_output_off; it->vn = curvnet; if (s_inp) { /* Assume lock is held here */ it->inp = s_inp; SCTP_INP_INCR_REF(it->inp); it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP; } else { SCTP_INP_INFO_RLOCK(); it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead)); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } SCTP_INP_INFO_RUNLOCK(); it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP; } SCTP_IPI_ITERATOR_WQ_LOCK(); if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__, SCTP_BASE_VAR(sctp_pcb_initialized), it); SCTP_FREE(it, SCTP_M_ITER); return (-1); } TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (sctp_it_ctl.iterator_running == 0) { sctp_wakeup_iterator(); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); /* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */ return (0); } /* * Atomically add flags to the sctp_flags of an inp. * To be used when the write lock of the inp is not held. */ void sctp_pcb_add_flags(struct sctp_inpcb *inp, uint32_t flags) { uint32_t old_flags, new_flags; do { old_flags = inp->sctp_flags; new_flags = old_flags | flags; } while (atomic_cmpset_int(&inp->sctp_flags, old_flags, new_flags) == 0); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 4baeb6f8f2d2..03efc759092d 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4778 +1,4778 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 static ip6proto_ctlinput_t tcp6_ctlinput; static udp_tun_icmp_t tcp6_ctlinput_viaudp; #endif VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif VNET_DEFINE(uint32_t, tcp_ack_war_time_window) = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_time_window), 0, "Time interval in ms used to limit the number (ack_war_cnt) of challenge ACKs sent per TCP connection"); VNET_DEFINE(uint32_t, tcp_ack_war_cnt) = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_cnt), 0, "Maximum number of challenge ACKs sent per TCP connection during the time interval (ack_war_timewindow)"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... */ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static counter_u64_t tcp_pacing_failures; static counter_u64_t tcp_dgp_failures; static uint32_t shadow_tcp_pacing_dgp = 0; static volatile uint32_t number_of_dgp_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); static int tcp_dgp_limit = -1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW, &tcp_dgp_limit, -1, "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD, &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD, &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif static int tcp_tcbhashsize = TCBHASHSIZE; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp, void **ptr); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr); static void tcp_default_switch_failed(struct tcpcb *tp); static ipproto_ctlinput_t tcp_ctlinput; static udp_tun_icmp_t tcp_ctlinput_viaudp; static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, .tfb_switch_failed = tcp_default_switch_failed, .tfb_flags = TCP_FUNC_DEFAULT_OK, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; VNET_DEFINE_STATIC(struct tcp_function_block *, tcp_func_set_ptr) = &tcp_def_funcblk; #define V_tcp_func_set_ptr VNET(tcp_func_set_ptr) void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk = NULL; rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return (blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk = NULL; struct tcp_function *f; rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } /* Find a matching alias for the given tcp_function_block. */ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = V_tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; void *ptr = NULL; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. */ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? */ if (tfb != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ if (tfb->tfb_tcp_fb_init == NULL || (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) { /* Release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* Now set in all the pointers */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; return; } /* * Initialization failed. Release the reference count on * the looked up default stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. */ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if ((*tfb->tfb_tcp_handoff_ok)(tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } if (tfb->tfb_tcp_fb_init != NULL && (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } /* Now release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* And set in the pointers to the new */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). */ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error = ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(V_tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return (error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } if ((blk->tfb_flags & TCP_FUNC_DEFAULT_OK) == 0) { error = EINVAL; goto done; } V_tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == V_tcp_func_set_ptr) ? '*' : ' ', alias ? f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static struct sx tcpoudp_lock; static void tcp_over_udp_stop(void) { sx_assert(&tcpoudp_lock, SA_XLOCKED); #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif sx_assert(&tcpoudp_lock, SA_XLOCKED); port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { sx_xlock(&tcpoudp_lock); V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); if (error != 0) { V_tcp_udp_tunneling_port = 0; } } sx_xunlock(&tcpoudp_lock); } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. */ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. */ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. * * The init() fuction from the default can *never* return non-zero i.e. * it is required to always succeed since it is the stack of last resort! */ static int tcp_default_fb_init(struct tcpcb *tp, void **ptr) { struct socket *so = tptosocket(tp); int rexmt; INP_WLOCK_ASSERT(tptoinpcb(tp)); /* We don't use the pointer */ *ptr = NULL; KASSERT(tp->t_state < TCPS_TIME_WAIT, ("%s: connection %p in unexpected state %d", __func__, tp, tp->t_state)); /* Make sure we get no interesting mbuf queuing behavior */ /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); /* Cancel the GP measurement in progress */ tp->t_flags &= ~TF_GPUTINPROG; /* Validate the timers are not in usec, if they are convert */ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; if (tp->t_rxtshift == 0) tp->t_rxtcur = rexmt; else TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't * know what to do for unexpected states (which includes TIME_WAIT). */ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) return (0); /* * Make sure some kind of transmission timer is set if there is * outstanding data. */ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || tcp_timer_active(tp, TT_PERSIST))) { /* * If the session has established and it looks like it should * be in the persist state, set the persist timer. Otherwise, * set the retransmit timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && (int32_t)(tp->snd_nxt - tp->snd_una) < (int32_t)sbavail(&so->so_snd)) tcp_setpersist(tp); else tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } /* All non-embryonic sessions get a keepalive timer. */ if (!tcp_timer_active(tp, TT_KEEP)) tcp_timer_activate(tp, TT_KEEP, TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); /* * Make sure critical variables are initialized * if transitioning while in Recovery. */ if IN_FASTRECOVERY(tp->t_flags) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } return (0); } /* * tfb_tcp_fb_fini() function for the default stack. * * This changes state as necessary (or prudent) to prepare for another stack * to assume responsibility for the connection. */ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif tp->t_acktime = 0; return; } MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *f[TCP_FUNCTION_NAME_NUM_MAX]; struct tcp_function_set fs; int error, i, num_registered; KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(*num_names > 0, ("%s: Called with non-positive length of name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if (*num_names > TCP_FUNCTION_NAME_NUM_MAX) { /* Too many names. */ *num_names = 0; return (E2BIG); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (blk->tfb_tcp_handoff_ok == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* These functions are required and a name is needed. */ *num_names = 0; return (EINVAL); } for (i = 0; i < *num_names; i++) { f[i] = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (f[i] == NULL) { while (--i >= 0) free(f[i], M_TCPFUNCTIONS); *num_names = 0; return (ENOMEM); } } num_registered = 0; rw_wlock(&tcp_function_lock); if (find_tcp_fb_locked(blk, NULL) != NULL) { /* A TCP function block can only be registered once. */ error = EALREADY; goto cleanup; } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { error = EINVAL; goto cleanup; } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ error = EALREADY; goto cleanup; } f[i]->tf_fb = blk; (void)strlcpy(f[i]->tf_name, names[i], sizeof(f[i]->tf_name)); TAILQ_INSERT_TAIL(&t_functions, f[i], tf_next); tcp_fb_cnt++; num_registered++; } rw_wunlock(&tcp_function_lock); return (0); cleanup: /* Remove the entries just added. */ for (i = 0; i < *num_names; i++) { if (i < num_registered) { TAILQ_REMOVE(&t_functions, f[i], tf_next); tcp_fb_cnt--; } f[i]->tf_fb = NULL; free(f[i], M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); *num_names = num_registered; return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed), or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; VNET_ITERATOR_DECL(vnet_iter); if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (blk == V_tcp_func_set_ptr) { /* You can't free the current default in some vnet. */ CURVNET_RESTORE(); VNET_LIST_RUNLOCK_NOSLEEP(); rw_wunlock(&tcp_function_lock); return (EBUSY); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. */ if (force && blk->tfb_refcnt) { struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); while ((inp = inp_next(&inpi)) != NULL) { tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. */ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void -tcp_drain(void) +tcp_drain(void *ctx __unused, int flags __unused) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); static void tcp_init(void *arg __unused) { int hashsize; tcp_reass_global_init(); /* XXX virtualize those below? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; if (tcp_rexmit_initial < 1) tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); sx_init(&tcpoudp_lock, "TCP over UDP configuration"); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. */ tcp_log_init(); #endif if (tcp_soreceive_stream) { #ifdef INET tcp_protosw.pr_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_protosw.pr_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); #else /* INET6 */ max_protohdr_grow(sizeof(struct tcpiphdr)); #endif /* INET6 */ ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); tcp_extra_mbuf = counter_u64_alloc(M_WAITOK); tcp_would_have_but = counter_u64_alloc(M_WAITOK); tcp_comp_total = counter_u64_alloc(M_WAITOK); tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); tcp_dgp_failures = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif hashsize = tcp_tcbhashsize; if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, "net.inet.tcp.tcbhashsize", hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } tcp_tcbhashsize = hashsize; #ifdef INET IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput); #endif #ifdef INET6 IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput); #endif } SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { #ifdef TCP_HHOOK int error; #endif tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. */ tcp_fastopen_destroy(); COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_off = 5; tcp_set_flags(th, 0); th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; int ect = 0; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tptoinpcb(tp); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* clear any receive flags for proper bpf timestamping */ m->m_flags &= ~(M_TSTMP | M_TSTMP_LRO); /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { ect = tcp_ecn_output_established(tp, &flags, 0, false); /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = htonl(ect << IPV6_FLOWLABEL_LEN); ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_len = htons(tlen); if (inp != NULL) { ip->ip_tos = inp->inp_ip_tos & ~IPTOS_ECN_MASK; ip->ip_ttl = inp->inp_ip_ttl; } else { ip->ip_tos = 0; ip->ip_ttl = V_ip_defttl; } ip->ip_tos |= ect; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp && (flags & TH_RST)) { /* Log the reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); } if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(inp, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && tcp_bblogging_on(tp)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. * * The following code ensures that the above description * is and stays correct. */ KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && (tp->t_state == TCPS_CLOSED || (tp->t_state > TCPS_LISTEN && tp->t_port != port)), ("%s: Logging of TCP segment with flags 0x%b and " "UDP encapsulation port %u skipped in state %s", __func__, thflags, PRINT_TH_FLAGS, ntohs(port), tcpstates[tp->t_state])); } } if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); #ifdef INET6 if (isipv6) { TCP_PROBE5(send, NULL, tp, ip6, tp, nth); output_ret = ip6_output(m, inp ? inp->in6p_outputopts : NULL, NULL, 0, NULL, NULL, inp); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { TCP_PROBE5(send, NULL, tp, ip, tp, nth); output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); } #endif if (lgb != NULL) lgb->tlb_errno = output_ret; } /* * Send a challenge ack (no data, no SACK option), but not more than * V_tcp_ack_war_cnt per V_tcp_ack_war_time_window (per TCP connection). */ void tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m) { sbintime_t now; bool send_challenge_ack; if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) { /* ACK war protection is disabled. */ send_challenge_ack = true; } else { /* Start new epoch, if the previous one is already over. */ now = getsbinuptime(); if (tp->t_challenge_ack_end < now) { tp->t_challenge_ack_cnt = 0; tp->t_challenge_ack_end = now + V_tcp_ack_war_time_window * SBT_1MS; } /* * Send a challenge ACK, if less than tcp_ack_war_cnt have been * sent in the current epoch. */ if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) { send_challenge_ack = true; tp->t_challenge_ack_cnt++; } else { send_challenge_ack = false; } } if (send_challenge_ack) { tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } /* * Create a new TCP control block, making an empty reassembly queue and hooking * it to the argument protocol control block. The `inp' parameter must have * come from the zone allocator set up by tcpcbstor declaration. * The caller can provide a pointer to a tcpcb of the listener to inherit the * TCP function block from the listener. */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp, struct tcpcb *listening_tcb) { struct tcpcb *tp = intotcpcb(inp); #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ /* * Historically allocation was done with M_ZERO. There is a lot of * code that rely on that. For now take safe approach and zero whole * tcpcb. This definitely can be optimized. */ bzero(&tp->t_start_zero, t_zero_size); /* Initialise cc_var struct for this tcpcb. */ tp->t_ccv.tp = tp; rw_rlock(&tcp_function_lock); if (listening_tcb != NULL) { INP_LOCK_ASSERT(tptoinpcb(listening_tcb)); KASSERT(listening_tcb->t_fb != NULL, ("tcp_newtcpcb: listening_tcb->t_fb is NULL")); if (listening_tcb->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) { rw_runlock(&tcp_function_lock); return (NULL); } tp->t_fb = listening_tcb->t_fb; } else { tp->t_fb = V_tcp_func_set_ptr; } refcount_acquire(&tp->t_fb->tfb_refcnt); KASSERT((tp->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) == 0, ("tcp_newtcpcb: using TFB being removed")); rw_runlock(&tcp_function_lock); CC_LIST_RLOCK(); if (listening_tcb != NULL) { if (CC_ALGO(listening_tcb)->flags & CC_MODULE_BEING_REMOVED) { CC_LIST_RUNLOCK(); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } CC_ALGO(tp) = CC_ALGO(listening_tcb); } else CC_ALGO(tp) = CC_DEFAULT_ALGO(); cc_refer(CC_ALGO(tp)); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #ifdef TCP_HHOOK if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #endif TAILQ_INIT(&tp->t_segq); STAILQ_INIT(&tp->t_inqueue); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); tp->t_hpts_cpu = HPTS_CPU_NONE; tp->t_lro_cpu = HPTS_CPU_NONE; callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_TRYLOCK | CALLOUT_RETURNUNLOCKED); for (int i = 0; i < TT_N; i++) tp->t_timers[i] = SBT_MAX; switch (V_tcp_do_rfc1323) { case 0: break; default: case 1: tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); break; case 2: tp->t_flags = TF_REQ_SCALE; break; case 3: tp->t_flags = TF_REQ_TSTMP; break; } if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* We always start with ticks granularity */ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. */ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } } #ifdef STATS if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; return (tp); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); /* Don't use tcp_output() here due to possible recursion. */ (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); struct mbuf *m; #ifdef INET6 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(inp); MPASS(!callout_active(&tp->t_callout)); MPASS(TAILQ_EMPTY(&tp->snd_holes)); /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; /* Detach from the CC algorithm */ cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif #ifdef STATS stats_blob_destroy(tp->t_stats); #endif CC_ALGO(tp) = NULL; if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) { struct mbuf *prev; STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, prev) m_freem(m); } TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); MPASS(!tcp_in_hpts(tp)); #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! * * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.hc_ssthresh = ssthresh; metrics.hc_rtt = tp->t_srtt; metrics.hc_rttvar = tp->t_rttvar; metrics.hc_cwnd = tp->snd_cwnd; metrics.hc_sendpipe = 0; metrics.hc_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } refcount_release(&tp->t_fb->tfb_refcnt); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } tcp_timer_stop(tp); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); tcp_free_sackholes(tp); soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); sorele(so); return (NULL); } return (tp); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { INP_RUNLOCK(inp); break; } else continue; } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. */ static inline int tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) { int mtu = ntohs(icp->icmp_nextmtu); /* If no alternative MTU was proposed, try the next smaller one. */ if (!mtu) mtu = ip_next_mtu(ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); return (mtu); } static void tcp_ctlinput_with_port(struct icmp *icp, uint16_t port) { struct ip *ip; struct tcphdr *th; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int); struct in_conninfo inc; tcp_seq icmp_tcp_seq; int errno, mtu; errno = icmp_errmap(icp); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } ip = &icp->icmp_ip; th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); icmp_tcp_seq = th->th_seq; inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* * MTU discovery for offloaded connections. Let * the TOE driver verify seq# and process it. */ mtu = tcp_next_pmtu(icp, ip); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: we got a needfrag and * will potentially try a lower MTU. */ mtu = tcp_next_pmtu(icp, ip); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = ip->ip_dst; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); inp = tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = ip->ip_dst; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp_ctlinput(struct icmp *icmp) { tcp_ctlinput_with_port(icmp, htons(0)); } static void tcp_ctlinput_viaudp(udp_tun_icmp_param_t param) { /* Its a tunneled TCP over UDP icmp */ struct icmp *icmp = param.icmp; struct ip *outer_ip, *inner_ip; struct udphdr *udp; struct tcphdr *th, ttemp; int i_hlen, o_len; uint16_t port; outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); inner_ip = &icmp->icmp_ip; i_hlen = inner_ip->ip_hl << 2; o_len = ntohs(outer_ip->ip_len); if (o_len < (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { /* Not enough data present */ return; } /* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */ udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; th = (struct tcphdr *)(udp + 1); memcpy(&ttemp, th, sizeof(struct tcphdr)); memcpy(udp, &ttemp, sizeof(struct tcphdr)); /* Now adjust down the size of the outer IP header */ o_len -= sizeof(struct udphdr); outer_ip->ip_len = htons(o_len); /* Now call in to the normal handling code */ tcp_ctlinput_with_port(icmp, port); } #endif /* INET */ #ifdef INET6 static inline int tcp6_next_pmtu(const struct icmp6_hdr *icmp6) { int mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was proposed, or the proposed MTU was too * small, set to the min. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? */ return (mtu); } static void tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int); struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; int errno; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; dst = &ip6cp->ip6c_finaldst->sin6_addr; errno = icmp6_errmap(icmp6); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: /* * There are only four ICMPs that may reset connection: * - administratively prohibited * - port unreachable * - time exceeded in transit * - unknown next header */ if (V_icmp_may_rst && ((icmp6->icmp6_type == ICMP6_DST_UNREACH && (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || (icmp6->icmp6_type == ICMP6_PARAM_PROB && icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp6_ctlinput(struct ip6ctlparam *ctl) { tcp6_ctlinput_with_port(ctl, htons(0)); } static void tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) { struct ip6ctlparam *ip6cp = param.ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now call in to the normal handling code */ tcp6_ctlinput_with_port(ip6cp, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ static struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (tp->t_flags & TF_FASTOPEN) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCK_SENDBUF_LOCK(so); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) { tp->t_maxseg = so->so_snd.sb_hiwat; if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACK's since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } } SOCK_SENDBUF_UNLOCK(so); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; /* XXXKIB IFCAP2_IPSEC_OFFLOAD_TSO */ cap->ipsec_tso = (ifp->if_capenable2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) != 0; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; cap->ipsec_tso = false; /* XXXKIB */ } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. */ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); /* * In case of the IPV6_USE_MIN_MTU socket * option, the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. */ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) { tp->t_maxseg = TCP6_MSS; if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACK's since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } } } } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (!SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); static int tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) { return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, &tcp_ctloutput_set)); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", "Set socket option for TCP endpoint"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); sorele(so); } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; #ifdef INET const struct ip *ip = (const struct ip *)ip4hdr; #endif #ifdef INET6 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); sbintime_t now; bzero(xt, sizeof(*xt)); xt->t_state = tp->t_state; xt->t_logstate = tcp_get_bblog_state(tp); xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_dsack_bytes = tp->t_dsack_bytes; xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; xt->t_dsack_pack = tp->t_dsack_pack; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; now = getsbinuptime(); #define COPYTIMER(which,where) do { \ if (tp->t_timers[which] != SBT_MAX) \ xt->where = (tp->t_timers[which] - now) / SBT_1MS; \ else \ xt->where = 0; \ } while (0) COPYTIMER(TT_DELACK, tt_delack); COPYTIMER(TT_REXMT, tt_rexmt); COPYTIMER(TT_PERSIST, tt_persist); COPYTIMER(TT_KEEP, tt_keep); COPYTIMER(TT_2MSL, tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? */ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { counter_u64_add(tcp_pacing_failures, 1); return (0); } } int tcp_incr_dgp_pacing_cnt(void) { if ((tcp_dgp_limit == -1) || (tcp_dgp_limit > number_of_dgp_connections)) { atomic_fetchadd_int(&number_of_dgp_connections, 1); shadow_tcp_pacing_dgp = number_of_dgp_connections; return (1); } else { counter_u64_add(tcp_dgp_failures, 1); return (0); } } static uint8_t tcp_dgp_warning = 0; void tcp_dec_dgp_pacing_cnt(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_dgp_connections, -1); shadow_tcp_pacing_dgp = number_of_dgp_connections; KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?")); if (ret == 0) { if (tcp_dgp_limit != -1) { printf("Warning all DGP is now disabled, count decrements invalidly!\n"); tcp_dgp_limit = 0; tcp_dgp_warning = 1; } else if (tcp_dgp_warning == 0) { printf("Warning DGP pacing is invalid, invalid decrement\n"); tcp_dgp_warning = 1; } } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 0; } else if (tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } } static void tcp_default_switch_failed(struct tcpcb *tp) { /* * If a switch fails we only need to * care about two things: * a) The t_flags2 * and * b) The timer granularity. * Timeouts, at least for now, don't use the * old callout system in the other stacks so * those are hopefully safe. */ tcp_lro_features_off(tp); tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); } #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss) { if (SEQ_LT(th->th_ack, tp->snd_una)) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_BEHIND]++; } return (ACK_BEHIND); } } else if (th->th_ack == tp->snd_una) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else if (tiwin != tp->snd_wnd) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_RWND]++; } return (ACK_RWND); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_DUPACK]++; } return (ACK_DUPACK); } } else { if (!SEQ_GT(th->th_ack, tp->snd_max)) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((th->th_ack - tp->snd_una) + mss - 1)/mss); } } if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK_SACK]++; } return (ACK_CUMACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK]++; } return (ACK_CUMACK); } } } #endif void tcp_change_time_units(struct tcpcb *tp, int granularity) { if (tp->t_tmr_granularity == granularity) { /* We are there */ return; } if (granularity == TCP_TMR_GRANULARITY_USEC) { KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS), ("Granularity is not TICKS its %u in tp:%p", tp->t_tmr_granularity, tp)); tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); if (tp->t_srtt > 1) { uint32_t val, frac; val = tp->t_srtt >> TCP_RTT_SHIFT; frac = tp->t_srtt & 0x1f; tp->t_srtt = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any) * but its in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; frac = tp->t_rttvar & 0x1f; tp->t_rttvar = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any) * but its in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_rttvar += frac; } } tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC; } else if (granularity == TCP_TMR_GRANULARITY_TICKS) { /* Convert back to ticks, with */ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC), ("Granularity is not USEC its %u in tp:%p", tp->t_tmr_granularity, tp)); if (tp->t_srtt > 1) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_srtt); frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); tp->t_srtt = val << TCP_RTT_SHIFT; /* * frac is the fractional part here is left * over from converting to hz and shifting. * We need to convert this to the 5 bit * remainder. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_rttvar); frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz); tp->t_rttvar = val << TCP_RTTVAR_SHIFT; /* * frac is the fractional part here is left * over from converting to hz and shifting. * We need to convert this to the 4 bit * remainder. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTTVAR_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTTVAR_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_rttvar += frac; } } tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; } #ifdef INVARIANTS else { panic("Unknown granularity:%d tp:%p", granularity, tp); } #endif } void tcp_handle_orphaned_packets(struct tcpcb *tp) { struct mbuf *save, *m, *prev; /* * Called when a stack switch is occuring from the fini() * of the old stack. We assue the init() as already been * run of the new stack and it has set the t_flags2 to * what it supports. This function will then deal with any * differences i.e. cleanup packets that maybe queued that * the newstack does not support. */ if (tp->t_flags2 & TF2_MBUF_L_ACKS) return; if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) == 0 && !STAILQ_EMPTY(&tp->t_inqueue)) { /* * It is unsafe to process the packets since a * reset may be lurking in them (its rare but it * can occur). If we were to find a RST, then we * would end up dropping the connection and the * INP lock, so when we return the caller (tcp_usrreq) * will blow up when it trys to unlock the inp. * This new stack does not do any fancy LRO features * so all we can do is toss the packets. */ m = STAILQ_FIRST(&tp->t_inqueue); STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) m_freem(m); } else { /* * Here we have a stack that does mbuf queuing but * does not support compressed ack's. We must * walk all the mbufs and discard any compressed acks. */ STAILQ_FOREACH_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) { if (m->m_flags & M_ACKCMP) { if (m == STAILQ_FIRST(&tp->t_inqueue)) STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); else STAILQ_REMOVE_AFTER(&tp->t_inqueue, prev, m_stailqpkt); m_freem(m); } else prev = m; } } } #ifdef TCP_REQUEST_TRK uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) { #ifdef KERN_TLS struct ktls_session *tls; uint32_t rec_oh, records; tls = so->so_snd.sb_tls_info; if (tls == NULL) return (0); rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); return (records * rec_oh); #else return (0); #endif } extern uint32_t tcp_stale_entry_time; uint32_t tcp_stale_entry_time = 250000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, &tcp_stale_entry_time, 250000, "Time that a tcpreq entry without a sendfile ages out"); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) { if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = val; log.u_bbr.rttProp = req->timestamp; log.u_bbr.delRate = req->start; log.u_bbr.cur_del_rate = req->end; log.u_bbr.flex1 = req->start_seq; log.u_bbr.flex2 = req->end_seq; log.u_bbr.flex3 = req->flags; log.u_bbr.flex4 = ((req->localtime >> 32) & 0x00000000ffffffff); log.u_bbr.flex5 = (req->localtime & 0x00000000ffffffff); log.u_bbr.flex7 = slot; log.u_bbr.bw_inuse = offset; /* nbytes = flex6 | epoch */ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); /* cspr = lt_epoch | pkts_out */ log.u_bbr.lt_epoch = ((req->cspr >> 32) & 0x00000000ffffffff); log.u_bbr.pkts_out |= (req->cspr & 0x00000000ffffffff); log.u_bbr.applimited = tp->t_tcpreq_closed; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_open; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_req; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_REQ_T, 0, 0, &log, false, &tv); } } void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent) { if (tp->t_tcpreq_req > 0) tp->t_tcpreq_req--; if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { if (tp->t_tcpreq_open > 0) tp->t_tcpreq_open--; } else { if (tp->t_tcpreq_closed > 0) tp->t_tcpreq_closed--; } ent->flags = TCP_TRK_TRACK_FLG_EMPTY; } static void tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) { struct tcp_sendfile_track *ent; uint64_t time_delta, oldest_delta; int i, oldest, oldest_set = 0, cnt_rm = 0; for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags != TCP_TRK_TRACK_FLG_USED) { /* * We only care about closed end ranges * that are allocated and have no sendfile * ever touching them. They would be in * state USED. */ continue; } if (ts >= ent->localtime) time_delta = ts - ent->localtime; else time_delta = 0; if (time_delta && ((oldest_delta < time_delta) || (oldest_set == 0))) { oldest_set = 1; oldest = i; oldest_delta = time_delta; } if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) { /* * No sendfile in a our time-limit * time to purge it. */ cnt_rm++; tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE, time_delta, 0); tcp_req_free_a_slot(tp, ent); } } if ((cnt_rm == 0) && rm_oldest && oldest_set) { ent = &tp->t_tcpreq_info[oldest]; tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE, oldest_delta, 1); tcp_req_free_a_slot(tp, ent); } } int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) { int i, ret = 0; struct tcp_sendfile_track *ent; /* Clean up any old closed end requests that are now completed */ if (tp->t_tcpreq_req == 0) return (0); if (tp->t_tcpreq_closed == 0) return (0); for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; /* Skip empty ones */ if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; /* Skip open ones */ if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) continue; if (SEQ_GEQ(ack_point, ent->end_seq)) { /* We are past it -- free it */ tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_FREED, 0, 0); tcp_req_free_a_slot(tp, ent); ret++; } } return (ret); } int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point) { if (tp->t_tcpreq_req == 0) return (-1); if (tp->t_tcpreq_closed == 0) return (-1); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) return (-1); if (SEQ_GEQ(ack_point, ent->end_seq)) { return (1); } return (0); } struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip) { /* * Given an ack point (th_ack) walk through our entries and * return the first one found that th_ack goes past the * end_seq. */ struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0) { if (SEQ_GEQ(th_ack, ent->end_seq)) { *ip = i; return (ent); } } } return (NULL); } struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) { struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_SEARCH, (uint64_t)seq, 0); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) { continue; } if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { /* * An open end request only needs to * match the beginning seq or be * all we have (once we keep going on * a open end request we may have a seq * wrap). */ if ((SEQ_GEQ(seq, ent->start_seq)) || (tp->t_tcpreq_closed == 0)) return (ent); } else { /* * For this one we need to * be a bit more careful if its * completed at least. */ if ((SEQ_GEQ(seq, ent->start_seq)) && (SEQ_LT(seq, ent->end_seq))) { return (ent); } } } return (NULL); } /* Should this be in its own file tcp_req.c ? */ struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups) { struct tcp_sendfile_track *fil; int i, allocated; /* In case the stack does not check for completions do so now */ tcp_req_check_for_comp(tp, tp->snd_una); /* Check for stale entries */ if (tp->t_tcpreq_req) tcp_req_check_for_stale_entries(tp, ts, (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ)); /* Check to see if this is a duplicate of one not started */ if (tp->t_tcpreq_req) { for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0) continue; if ((fil->timestamp == req->timestamp) && (fil->start == req->start) && ((fil->flags & TCP_TRK_TRACK_FLG_OPEN) || (fil->end == req->end))) { /* * We already have this request * and it has not been started with sendfile. * This probably means the user was returned * a 4xx of some sort and its going to age * out, lets not duplicate it. */ return (fil); } } } /* Ok if there is no room at the inn we are in trouble */ if (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ) { tcp_trace_point(tp, TCP_TP_REQ_LOG_FAIL); for (i = 0; i < MAX_TCP_TRK_REQ; i++) { tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_ALLOCFAIL, 0, 0); } return (NULL); } for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if (fil->flags == TCP_TRK_TRACK_FLG_EMPTY) { allocated = 1; fil->flags = TCP_TRK_TRACK_FLG_USED; fil->timestamp = req->timestamp; fil->playout_ms = req->playout_ms; fil->localtime = ts; fil->start = req->start; if (req->flags & TCP_LOG_HTTPD_RANGE_END) { fil->end = req->end; } else { fil->end = 0; fil->flags |= TCP_TRK_TRACK_FLG_OPEN; } /* * We can set the min boundaries to the TCP Sequence space, * but it might be found to be further up when sendfile * actually runs on this range (if it ever does). */ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; fil->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; if (req->flags & TCP_LOG_HTTPD_RANGE_END) fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); else fil->end_seq = 0; if (tptosocket(tp)->so_snd.sb_tls_info) { /* * This session is doing TLS. Take a swag guess * at the overhead. */ fil->end_seq += tcp_estimate_tls_overhead( tptosocket(tp), (fil->end - fil->start)); } tp->t_tcpreq_req++; if (fil->flags & TCP_TRK_TRACK_FLG_OPEN) tp->t_tcpreq_open++; else tp->t_tcpreq_closed++; tcp_req_log_req_info(tp, fil, i, TCP_TRK_REQ_LOG_NEW, 0, 0); break; } else fil = NULL; } return (fil); } void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) { (void)tcp_req_alloc_req_full(tp, &user->tcp_req, ts, 1); } #endif void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err) { if (tcp_bblogging_on(tp)) { struct tcp_log_buffer *l; l = tcp_log_event(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_SOCKET_OPT, err, 0, NULL, 1, NULL, NULL, 0, NULL); if (l) { l->tlb_flex1 = option_num; l->tlb_flex2 = option_val; } } } uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity) { uint32_t srtt; KASSERT(granularity == TCP_TMR_GRANULARITY_USEC || granularity == TCP_TMR_GRANULARITY_TICKS, ("%s: called with unexpected granularity %d", __func__, granularity)); srtt = tp->t_srtt; /* * We only support two granularities. If the stored granularity * does not match the granularity requested by the caller, * convert the stored value to the requested unit of granularity. */ if (tp->t_tmr_granularity != granularity) { if (granularity == TCP_TMR_GRANULARITY_USEC) srtt = TICKS_2_USEC(srtt); else srtt = USEC_2_TICKS(srtt); } /* * If the srtt is stored with ticks granularity, we need to * unshift to get the actual value. We do this after the * conversion above (if one was necessary) in order to maximize * precision. */ if (tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS) srtt = srtt >> TCP_RTT_SHIFT; return (srtt); } void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp, bool hw_tls) { if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; #ifdef KERN_TLS if (hw_tls && is_rxt && len != 0) { uint64_t rexmit_percent; rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); if (rexmit_percent > ktls_ifnet_max_rexmit_pct) ktls_disable_ifnet(tp); } #endif } diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c index e976298bf984..f6bb6e94ed08 100644 --- a/sys/netinet6/frag6.c +++ b/sys/netinet6/frag6.c @@ -1,1112 +1,1112 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * Copyright (c) 2019 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $ */ #include #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* For ECN definitions. */ #include /* For ECN definitions. */ #ifdef MAC #include #endif /* * A "big picture" of how IPv6 fragment queues are all linked together. * * struct ip6qbucket ip6qb[...]; hashed buckets * |||||||| * | * +--- TAILQ(struct ip6q, packets) *q6; tailq entries holding * |||||||| fragmented packets * | (1 per original packet) * | * +--- TAILQ(struct ip6asfrag, ip6q_frags) *af6; tailq entries of IPv6 * | *ip6af;fragment packets * | for one original packet * + *mbuf */ /* Reassembly headers are stored in hash buckets. */ #define IP6REASS_NHASH_LOG2 10 #define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #define IP6REASS_HMASK (IP6REASS_NHASH - 1) TAILQ_HEAD(ip6qhead, ip6q); struct ip6qbucket { struct ip6qhead packets; struct mtx lock; int count; }; struct ip6asfrag { TAILQ_ENTRY(ip6asfrag) ip6af_tq; struct mbuf *ip6af_m; int ip6af_offset; /* Offset in ip6af_m to next header. */ int ip6af_frglen; /* Fragmentable part length. */ int ip6af_off; /* Fragment offset. */ bool ip6af_mff; /* More fragment bit in frag off. */ }; static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header"); #ifdef VIMAGE /* A flag to indicate if IPv6 fragmentation is initialized. */ VNET_DEFINE_STATIC(bool, frag6_on); #define V_frag6_on VNET(frag6_on) #endif /* System wide (global) maximum and count of packets in reassembly queues. */ static int ip6_maxfrags; static u_int __exclusive_cache_line frag6_nfrags; /* Maximum and current packets in per-VNET reassembly queue. */ VNET_DEFINE_STATIC(int, ip6_maxfragpackets); VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); #define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_frag6_nfragpackets VNET(frag6_nfragpackets) /* Maximum per-VNET reassembly timeout (milliseconds) */ VNET_DEFINE_STATIC(u_int, ip6_fraglifetime) = IPV6_DEFFRAGTTL; #define V_ip6_fraglifetime VNET(ip6_fraglifetime) /* Maximum per-VNET reassembly queues per bucket and fragments per packet. */ VNET_DEFINE_STATIC(int, ip6_maxfragbucketsize); VNET_DEFINE_STATIC(int, ip6_maxfragsperpacket); #define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) #define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) /* Per-VNET reassembly queue buckets. */ VNET_DEFINE_STATIC(struct ip6qbucket, ip6qb[IP6REASS_NHASH]); VNET_DEFINE_STATIC(uint32_t, ip6qb_hashseed); #define V_ip6qb VNET(ip6qb) #define V_ip6qb_hashseed VNET(ip6qb_hashseed) #define IP6QB_LOCK(_b) mtx_lock(&V_ip6qb[(_b)].lock) #define IP6QB_TRYLOCK(_b) mtx_trylock(&V_ip6qb[(_b)].lock) #define IP6QB_LOCK_ASSERT(_b) mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED) #define IP6QB_UNLOCK(_b) mtx_unlock(&V_ip6qb[(_b)].lock) #define IP6QB_HEAD(_b) (&V_ip6qb[(_b)].packets) /* * By default, limit the number of IP6 fragments across all reassembly * queues to 1/32 of the total number of mbuf clusters. * * Limit the total number of reassembly queues per VNET to the * IP6 fragment limit, but ensure the limit will not allow any bucket * to grow above 100 items. (The bucket limit is * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct * multiplier to reach a 100-item limit.) * The 100-item limit was chosen as brief testing seems to show that * this produces "reasonable" performance on some subset of systems * under DoS attack. */ #define IP6_MAXFRAGS (nmbclusters / 32) #define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) /* Interval between periodic reassembly queue inspections */ #define IP6_CALLOUT_INTERVAL_MS 500 /* * Sysctls and helper function. */ SYSCTL_DECL(_net_inet6_ip6); SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfrags, CTLFLAG_RD, &frag6_nfrags, 0, "Global number of IPv6 fragments across all reassembly queues."); static void frag6_set_bucketsize(void) { int i; if ((i = V_ip6_maxfragpackets) > 0) V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, "Maximum allowed number of outstanding IPv6 packet fragments. " "A value of 0 means no fragmented packets will be accepted, while " "a value of -1 means no limit"); static int sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_maxfragpackets; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); V_ip6_maxfragpackets = val; frag6_set_bucketsize(); return (0); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_maxfragpackets, "I", "Default maximum number of outstanding fragmented IPv6 packets. " "A value of 0 means no fragmented packets will be accepted, while a " "a value of -1 means no limit"); SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfragpackets, CTLFLAG_VNET | CTLFLAG_RD, __DEVOLATILE(u_int *, &VNET_NAME(frag6_nfragpackets)), 0, "Per-VNET number of IPv6 fragments across all reassembly queues."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, "Maximum allowed number of fragments per packet"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, "Maximum number of reassembly queues per hash bucket"); static int frag6_milli_to_callout_ticks(int ms) { return (ms / IP6_CALLOUT_INTERVAL_MS); } static int frag6_callout_ticks_to_milli(int ms) { return (ms * IP6_CALLOUT_INTERVAL_MS); } _Static_assert(sizeof(((struct ip6q *)NULL)->ip6q_ttl) >= 2, "ip6q_ttl field is not large enough"); static int sysctl_ip6_fraglifetime(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_fraglifetime; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); if (val <= 0) val = IPV6_DEFFRAGTTL; if (frag6_milli_to_callout_ticks(val) >= 65536) val = frag6_callout_ticks_to_milli(65535); #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { CURVNET_SET(vnet0); int host_val = V_ip6_fraglifetime; CURVNET_RESTORE(); if (val > host_val) val = host_val; } #endif V_ip6_fraglifetime = val; return (0); } SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, fraglifetime_ms, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_fraglifetime, "I", "Fragment lifetime, in milliseconds"); /* * Remove the IPv6 fragmentation header from the mbuf. */ int ip6_deletefraghdr(struct mbuf *m, int offset, int wait __unused) { struct ip6_hdr *ip6; KASSERT(m->m_len >= offset + sizeof(struct ip6_frag), ("%s: ext headers not contigous in mbuf %p m_len %d >= " "offset %d + %zu\n", __func__, m, m->m_len, offset, sizeof(struct ip6_frag))); /* Delete frag6 header. */ ip6 = mtod(m, struct ip6_hdr *); bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), offset); m->m_data += sizeof(struct ip6_frag); m->m_len -= sizeof(struct ip6_frag); m->m_flags |= M_FRAGMENTED; return (0); } static void frag6_rmqueue(struct ip6q *q6, uint32_t bucket) { IP6QB_LOCK_ASSERT(bucket); TAILQ_REMOVE(IP6QB_HEAD(bucket), q6, ip6q_tq); V_ip6qb[bucket].count--; #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); } /* * Free a fragment reassembly header and all associated datagrams. */ static void frag6_freef(struct ip6q *q6, uint32_t bucket) { struct ip6_hdr *ip6; struct ip6asfrag *af6; struct mbuf *m; IP6QB_LOCK_ASSERT(bucket); while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { m = af6->ip6af_m; TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); /* * Return ICMP time exceeded error for the 1st fragment. * Just free other fragments. */ if (af6->ip6af_off == 0 && m->m_pkthdr.rcvif != NULL) { /* Adjust pointer. */ ip6 = mtod(m, struct ip6_hdr *); /* Restore source and destination addresses. */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_REASSEMBLY, 0); } else m_freem(m); free(af6, M_FRAG6); } atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); frag6_rmqueue(q6, bucket); } /* * Drain off all datagram fragments belonging to * the given network interface. */ static void frag6_cleanup(void *arg __unused, struct ifnet *ifp) { struct ip6qhead *head; struct ip6q *q6; struct ip6asfrag *af6; uint32_t bucket; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); CURVNET_SET_QUIET(ifp->if_vnet); #ifdef VIMAGE /* * Skip processing if IPv6 reassembly is not initialised or * torn down by frag6_destroy(). */ if (!V_frag6_on) { CURVNET_RESTORE(); return; } #endif for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); /* Scan fragment list. */ TAILQ_FOREACH(q6, head, ip6q_tq) { TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { /* Clear no longer valid rcvif pointer. */ if (af6->ip6af_m->m_pkthdr.rcvif == ifp) af6->ip6af_m->m_pkthdr.rcvif = NULL; } } IP6QB_UNLOCK(bucket); } CURVNET_RESTORE(); } EVENTHANDLER_DEFINE(ifnet_departure_event, frag6_cleanup, NULL, 0); /* * Like in RFC2460, in RFC8200, fragment and reassembly rules do not agree with * each other, in terms of next header field handling in fragment header. * While the sender will use the same value for all of the fragmented packets, * receiver is suggested not to check for consistency. * * Fragment rules (p18,p19): * (2) A Fragment header containing: * The Next Header value that identifies the first header * after the Per-Fragment headers of the original packet. * -> next header field is same for all fragments * * Reassembly rule (p20): * The Next Header field of the last header of the Per-Fragment * headers is obtained from the Next Header field of the first * fragment's Fragment header. * -> should grab it from the first fragment only * * The following note also contradicts with fragment rule - no one is going to * send different fragment with different next header field. * * Additional note (p22) [not an error]: * The Next Header values in the Fragment headers of different * fragments of the same original packet may differ. Only the value * from the Offset zero fragment packet is used for reassembly. * -> should grab it from the first fragment only * * There is no explicit reason given in the RFC. Historical reason maybe? */ /* * Fragment input. */ int frag6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; struct ip6qhead *head; struct ip6q *q6; struct ip6asfrag *af6, *ip6af, *af6tmp; struct in6_ifaddr *ia6; struct ifnet *dstifp, *srcifp; uint32_t hashkey[(sizeof(struct in6_addr) * 2 + sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)]; uint32_t bucket, *hashkeyp; int fragoff, frgpartlen; /* Must be larger than uint16_t. */ int nxt, offset, plen; uint8_t ecn, ecn0; bool only_frag; #ifdef RSS struct ip6_direct_ctx *ip6dc; struct m_tag *mtag; #endif m = *mp; offset = *offp; M_ASSERTPKTHDR(m); if (m->m_len < offset + sizeof(struct ip6_frag)) { m = m_pullup(m, offset + sizeof(struct ip6_frag)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); dstifp = NULL; /* Find the destination interface of the packet. */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL) dstifp = ia6->ia_ifp; /* Jumbo payload cannot contain a fragment header. */ if (ip6->ip6_plen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); *mp = NULL; return (IPPROTO_DONE); } /* * Check whether fragment packet's fragment length is a * multiple of 8 octets (unless it is the last one). * sizeof(struct ip6_frag) == 8 * sizeof(struct ip6_hdr) = 40 */ ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) && (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); *mp = NULL; return (IPPROTO_DONE); } IP6STAT_INC(ip6s_fragments); in6_ifstat_inc(dstifp, ifs6_reass_reqd); /* * Handle "atomic" fragments (offset and m bit set to 0) upfront, * unrelated to any reassembly. We need to remove the frag hdr * which is ugly. * See RFC 6946 and section 4.5 of RFC 8200. */ if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { IP6STAT_INC(ip6s_atomicfrags); nxt = ip6f->ip6f_nxt; /* * Set nxt(-hdr field value) to the original value. * We cannot just set ip6->ip6_nxt as there might be * an unfragmentable part with extension headers and * we must update the last one. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct ip6_frag)); if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) goto dropfrag2; m->m_pkthdr.len -= sizeof(struct ip6_frag); in6_ifstat_inc(dstifp, ifs6_reass_ok); *mp = m; return (nxt); } /* Offset now points to data portion. */ offset += sizeof(struct ip6_frag); /* Get fragment length and discard 0-byte fragments. */ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (frgpartlen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); *mp = NULL; return (IPPROTO_DONE); } /* * Enforce upper bound on number of fragments for the entire system. * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ if (ip6_maxfrags < 0) ; else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags) goto dropfrag2; /* * Validate that a full header chain to the ULP is present in the * packet containing the first fragment as per RFC RFC7112 and * RFC 8200 pages 18,19: * The first fragment packet is composed of: * (3) Extension headers, if any, and the Upper-Layer header. These * headers must be in the first fragment. ... */ fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); /* XXX TODO. thj has D16851 open for this. */ /* Send ICMPv6 4,3 in case of violation. */ /* Store receive network interface pointer for later. */ srcifp = m->m_pkthdr.rcvif; /* Generate a hash value for fragment bucket selection. */ hashkeyp = hashkey; memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); *hashkeyp = ip6f->ip6f_ident; bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed); bucket &= IP6REASS_HMASK; IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); TAILQ_FOREACH(q6, head, ip6q_tq) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst) #ifdef MAC && mac_ip6q_match(m, q6) #endif ) break; only_frag = false; if (q6 == NULL) { /* A first fragment to arrive creates a reassembly queue. */ only_frag = true; /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; * If maxfragpackets is 0, never accept fragments. * If maxfragpackets is -1, accept all fragments without * limitation. */ if (V_ip6_maxfragpackets < 0) ; else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize || atomic_load_int(&V_frag6_nfragpackets) >= (u_int)V_ip6_maxfragpackets) goto dropfrag; /* Allocate IPv6 fragement packet queue entry. */ q6 = malloc(sizeof(struct ip6q), M_FRAG6, M_NOWAIT | M_ZERO); if (q6 == NULL) goto dropfrag; #ifdef MAC if (mac_ip6q_init(q6, M_NOWAIT) != 0) { free(q6, M_FRAG6); goto dropfrag; } mac_ip6q_create(m, q6); #endif atomic_add_int(&V_frag6_nfragpackets, 1); /* ip6q_nxt will be filled afterwards, from 1st fragment. */ TAILQ_INIT(&q6->ip6q_frags); q6->ip6q_ident = ip6f->ip6f_ident; q6->ip6q_ttl = frag6_milli_to_callout_ticks(V_ip6_fraglifetime); q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; q6->ip6q_ecn = IPV6_ECN(ip6); q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */ /* Add the fragemented packet to the bucket. */ TAILQ_INSERT_HEAD(head, q6, ip6q_tq); V_ip6qb[bucket].count++; } /* * If it is the 1st fragment, record the length of the * unfragmentable part and the next header of the fragment header. * Assume the first 1st fragement to arrive will be correct. * We do not have any duplicate checks here yet so another packet * with fragoff == 0 could come and overwrite the ip6q_unfrglen * and worse, the next header, at any time. */ if (fragoff == 0 && q6->ip6q_unfrglen == -1) { q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag); q6->ip6q_nxt = ip6f->ip6f_nxt; /* XXX ECN? */ } /* * Check that the reassembled packet would not exceed 65535 bytes * in size. * If it would exceed, discard the fragment and return an ICMP error. */ if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. */ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { if (only_frag) frag6_rmqueue(q6, bucket); IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); *mp = NULL; return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { if (only_frag) frag6_rmqueue(q6, bucket); IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); *mp = NULL; return (IPPROTO_DONE); } /* * If it is the first fragment, do the above check for each * fragment already stored in the reassembly queue. */ if (fragoff == 0 && !only_frag) { TAILQ_FOREACH_SAFE(af6, &q6->ip6q_frags, ip6af_tq, af6tmp) { if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen > IPV6_MAXPACKET) { struct ip6_hdr *ip6err; struct mbuf *merr; int erroff; merr = af6->ip6af_m; erroff = af6->ip6af_offset; /* Dequeue the fragment. */ TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); q6->ip6q_nfrag--; atomic_subtract_int(&frag6_nfrags, 1); free(af6, M_FRAG6); /* Set a valid receive interface pointer. */ merr->m_pkthdr.rcvif = srcifp; /* Adjust pointer. */ ip6err = mtod(merr, struct ip6_hdr *); /* * Restore source and destination addresses * in the erroneous IPv6 header. */ ip6err->ip6_src = q6->ip6q_src; ip6err->ip6_dst = q6->ip6q_dst; icmp6_error(merr, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); } } } /* Allocate an IPv6 fragement queue entry for this fragmented part. */ ip6af = malloc(sizeof(struct ip6asfrag), M_FRAG6, M_NOWAIT | M_ZERO); if (ip6af == NULL) goto dropfrag; ip6af->ip6af_mff = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) ? true : false; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; ip6af->ip6af_offset = offset; ip6af->ip6af_m = m; if (only_frag) { /* * Do a manual insert rather than a hard-to-understand cast * to a different type relying on data structure order to work. */ TAILQ_INSERT_HEAD(&q6->ip6q_frags, ip6af, ip6af_tq); goto postinsert; } /* Do duplicate, condition, and boundry checks. */ /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * Drop if CE and not-ECT are mixed for the same packet. */ ecn = IPV6_ECN(ip6); ecn0 = q6->ip6q_ecn; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) { free(ip6af, M_FRAG6); goto dropfrag; } if (ecn0 != IPTOS_ECN_CE) q6->ip6q_ecn = IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { free(ip6af, M_FRAG6); goto dropfrag; } /* Find a fragmented part which begins after this one does. */ TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) if (af6->ip6af_off > ip6af->ip6af_off) break; /* * If the incoming framgent overlaps some existing fragments in * the reassembly queue, drop both the new fragment and the * entire reassembly queue. However, if the new fragment * is an exact duplicate of an existing fragment, only silently * drop the existing fragment and leave the fragmentation queue * unchanged, as allowed by the RFC. (RFC 8200, 4.5) */ if (af6 != NULL) af6tmp = TAILQ_PREV(af6, ip6fraghead, ip6af_tq); else af6tmp = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); if (af6tmp != NULL) { if (af6tmp->ip6af_off + af6tmp->ip6af_frglen - ip6af->ip6af_off > 0) { if (af6tmp->ip6af_off != ip6af->ip6af_off || af6tmp->ip6af_frglen != ip6af->ip6af_frglen) frag6_freef(q6, bucket); free(ip6af, M_FRAG6); goto dropfrag; } } if (af6 != NULL) { if (ip6af->ip6af_off + ip6af->ip6af_frglen - af6->ip6af_off > 0) { if (af6->ip6af_off != ip6af->ip6af_off || af6->ip6af_frglen != ip6af->ip6af_frglen) frag6_freef(q6, bucket); free(ip6af, M_FRAG6); goto dropfrag; } } #ifdef MAC mac_ip6q_update(m, q6); #endif /* * Stick new segment in its place; check for complete reassembly. * If not complete, check fragment limit. Move to front of packet * queue, as we are the most recently active fragmented packet. */ if (af6 != NULL) TAILQ_INSERT_BEFORE(af6, ip6af, ip6af_tq); else TAILQ_INSERT_TAIL(&q6->ip6q_frags, ip6af, ip6af_tq); postinsert: atomic_add_int(&frag6_nfrags, 1); q6->ip6q_nfrag++; plen = 0; TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { if (af6->ip6af_off != plen) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); *mp = NULL; return (IPPROTO_DONE); } plen += af6->ip6af_frglen; } af6 = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); if (af6->ip6af_mff) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); *mp = NULL; return (IPPROTO_DONE); } /* Reassembly is complete; concatenate fragments. */ ip6af = TAILQ_FIRST(&q6->ip6q_frags); t = m = ip6af->ip6af_m; TAILQ_REMOVE(&q6->ip6q_frags, ip6af, ip6af_tq); while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { m->m_pkthdr.csum_flags &= af6->ip6af_m->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += af6->ip6af_m->m_pkthdr.csum_data; TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); t = m_last(t); m_adj(af6->ip6af_m, af6->ip6af_offset); m_demote_pkthdr(af6->ip6af_m); m_cat(t, af6->ip6af_m); free(af6, M_FRAG6); } while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); /* Adjust offset to point where the original next header starts. */ offset = ip6af->ip6af_offset - sizeof(struct ip6_frag); free(ip6af, M_FRAG6); if ((u_int)plen + (u_int)offset - sizeof(struct ip6_hdr) > IPV6_MAXPACKET) { frag6_freef(q6, bucket); goto dropfrag; } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr)); if (q6->ip6q_ecn == IPTOS_ECN_CE) ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); nxt = q6->ip6q_nxt; ip6_deletefraghdr(m, offset, M_NOWAIT); /* Set nxt(-hdr field value) to the original value. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); #ifdef MAC mac_ip6q_reassemble(q6, m); #endif atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); frag6_rmqueue(q6, bucket); if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; /* Set a valid receive interface pointer. */ m->m_pkthdr.rcvif = srcifp; } #ifdef RSS mtag = m_tag_alloc(MTAG_ABI_IPV6, IPV6_TAG_DIRECT, sizeof(*ip6dc), M_NOWAIT); if (mtag == NULL) goto dropfrag; ip6dc = (struct ip6_direct_ctx *)(mtag + 1); ip6dc->ip6dc_nxt = nxt; ip6dc->ip6dc_off = offset; m_tag_prepend(m, mtag); #endif IP6QB_UNLOCK(bucket); IP6STAT_INC(ip6s_reassembled); in6_ifstat_inc(dstifp, ifs6_reass_ok); #ifdef RSS /* Queue/dispatch for reprocessing. */ netisr_dispatch(NETISR_IPV6_DIRECT, m); *mp = NULL; return (IPPROTO_DONE); #endif /* Tell launch routine the next header. */ *mp = m; *offp = offset; return (nxt); dropfrag: IP6QB_UNLOCK(bucket); dropfrag2: in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); m_freem(m); *mp = NULL; return (IPPROTO_DONE); } /* * IPv6 reassembling timer processing; * if a timer expires on a reassembly queue, discard it. */ static struct callout frag6_callout; static void frag6_slowtimo(void *arg __unused) { VNET_ITERATOR_DECL(vnet_iter); struct ip6qhead *head; struct ip6q *q6, *q6tmp; uint32_t bucket; if (atomic_load_int(&frag6_nfrags) == 0) goto done; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { if (V_ip6qb[bucket].count == 0) continue; IP6QB_LOCK(bucket); head = IP6QB_HEAD(bucket); TAILQ_FOREACH_SAFE(q6, head, ip6q_tq, q6tmp) if (--q6->ip6q_ttl == 0) { IP6STAT_ADD(ip6s_fragtimeout, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. * Note that we drain all reassembly queues if * maxfragpackets is 0 (fragmentation is disabled), * and do not enforce a limit when maxfragpackets * is negative. */ while ((V_ip6_maxfragpackets == 0 || (V_ip6_maxfragpackets > 0 && V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) && (q6 = TAILQ_LAST(head, ip6qhead)) != NULL) { IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); } /* * If we are still over the maximum number of fragmented * packets, drain off enough to get down to the new limit. */ bucket = 0; while (V_ip6_maxfragpackets >= 0 && atomic_load_int(&V_frag6_nfragpackets) > (u_int)V_ip6_maxfragpackets) { IP6QB_LOCK(bucket); q6 = TAILQ_LAST(IP6QB_HEAD(bucket), ip6qhead); if (q6 != NULL) { IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); bucket = (bucket + 1) % IP6REASS_NHASH; } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); done: callout_reset_sbt(&frag6_callout, SBT_1MS * IP6_CALLOUT_INTERVAL_MS, SBT_1MS * 10, frag6_slowtimo, NULL, 0); } static void frag6_slowtimo_init(void *arg __unused) { callout_init(&frag6_callout, 1); callout_reset_sbt(&frag6_callout, SBT_1MS * IP6_CALLOUT_INTERVAL_MS, SBT_1MS * 10, frag6_slowtimo, NULL, 0); } SYSINIT(frag6, SI_SUB_VNET_DONE, SI_ORDER_ANY, frag6_slowtimo_init, NULL); /* * Eventhandler to adjust limits in case nmbclusters change. */ static void frag6_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); ip6_maxfrags = IP6_MAXFRAGS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Initialise reassembly queue and fragment identifier. */ void frag6_init(void) { uint32_t bucket; V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { TAILQ_INIT(IP6QB_HEAD(bucket)); mtx_init(&V_ip6qb[bucket].lock, "ip6qb", NULL, MTX_DEF); V_ip6qb[bucket].count = 0; } V_ip6qb_hashseed = arc4random(); V_ip6_maxfragsperpacket = 64; #ifdef VIMAGE V_frag6_on = true; #endif if (!IS_DEFAULT_VNET(curvnet)) return; ip6_maxfrags = IP6_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } /* * Drain off all datagram fragments. */ static void frag6_drain_one(void) { struct ip6q *q6; uint32_t bucket; for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { IP6QB_LOCK(bucket); while ((q6 = TAILQ_FIRST(IP6QB_HEAD(bucket))) != NULL) { IP6STAT_INC(ip6s_fragdropped); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6, bucket); } IP6QB_UNLOCK(bucket); } } void -frag6_drain(void) +frag6_drain(void *arg __unused, int flags __unused) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); frag6_drain_one(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } #ifdef VIMAGE /* * Clear up IPv6 reassembly structures. */ void frag6_destroy(void) { uint32_t bucket; frag6_drain_one(); V_frag6_on = false; for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { KASSERT(V_ip6qb[bucket].count == 0, ("%s: V_ip6qb[%d] (%p) count not 0 (%d)", __func__, bucket, &V_ip6qb[bucket], V_ip6qb[bucket].count)); mtx_destroy(&V_ip6qb[bucket].lock); } } #endif diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h index 1aa170f6ed2b..12b00d4f9934 100644 --- a/sys/netinet6/ip6_var.h +++ b/sys/netinet6/ip6_var.h @@ -1,484 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ #include #ifdef _KERNEL struct ip6asfrag; /* frag6.c */ TAILQ_HEAD(ip6fraghead, ip6asfrag); /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { struct ip6fraghead ip6q_frags; u_int32_t ip6q_ident; u_int8_t ip6q_nxt; u_int8_t ip6q_ecn; u_int16_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; TAILQ_ENTRY(ip6q) ip6q_tq; int ip6q_unfrglen; /* len of unfragmentable part */ int ip6q_nfrag; /* # of fragments */ struct label *ip6q_label; }; #endif /* _KERNEL */ /* * IP6 reinjecting structure. */ struct ip6_direct_ctx { uint32_t ip6dc_nxt; /* next header to process */ uint32_t ip6dc_off; /* offset to next header */ }; #if defined(_NETINET6_IN6_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.in6p_moptions and * passed to ip6_output when IPv6 multicast options are in use. * This structure is lazy-allocated. */ struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ struct ip6_mfilter_head im6o_head; /* group membership list */ }; #else struct ip6_moptions; #endif /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route /* Nexthop related info */ struct ip6po_nhinfo { struct sockaddr *ip6po_nhi_nexthop; struct route_in6 ip6po_nhi_route; /* Route to the nexthop */ }; #define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop #define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route /* * Note that fields with valid data must be flagged in ip6po_valid. * This is done to reduce cache misses in ip6_output(). Before * ip6po_valid, ip6_output needed to check all the individual fields * of ip6_pktopts needed to be checked themselves, and they are spread * across 4 cachelines. ip6_output() is currently the only consumer of * these flags, as it is in the critical path of every packet sent. */ struct ip6_pktopts { uint32_t ip6po_valid; #define IP6PO_VALID_HLIM 0x0001 #define IP6PO_VALID_PKTINFO 0x0002 #define IP6PO_VALID_NHINFO 0x0004 #define IP6PO_VALID_HBH 0x0008 #define IP6PO_VALID_DEST1 0x0010 #define IP6PO_VALID_RHINFO 0x0020 #define IP6PO_VALID_DEST2 0x0040 #define IP6PO_VALID_TC 0x0080 int ip6po_hlim; /* Hoplimit for outgoing packets */ int ip6po_tclass; /* traffic class */ int ip6po_minmtu; /* fragment vs PMTU discovery policy */ #define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/ #define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ #define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ int ip6po_prefer_tempaddr; /* whether temporary addresses are preferred as source address */ #define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ #define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ #define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ int ip6po_flags; #if 0 /* parameters in this block is obsolete. do not reuse the values. */ #define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */ #define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #endif #define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ #define IP6PO_USECOA 0x08 /* use care of address */ struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; /* Next-hop address information */ struct ip6po_nhinfo ip6po_nhinfo; struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ struct ip6_dest *ip6po_dest1; /* Routing header related info. */ struct ip6po_rhinfo ip6po_rhinfo; /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; }; /* * Control options for incoming packets */ struct ip6stat { uint64_t ip6s_total; /* total packets received */ uint64_t ip6s_tooshort; /* packet too short */ uint64_t ip6s_toosmall; /* not enough data */ uint64_t ip6s_fragments; /* fragments received */ uint64_t ip6s_fragdropped; /* frags dropped(dups, out of space) */ uint64_t ip6s_fragtimeout; /* fragments timed out */ uint64_t ip6s_fragoverflow; /* fragments that exceeded limit */ uint64_t ip6s_forward; /* packets forwarded */ uint64_t ip6s_cantforward; /* packets rcvd for unreachable dest */ uint64_t ip6s_redirectsent; /* packets forwarded on same net */ uint64_t ip6s_delivered; /* datagrams delivered to upper level*/ uint64_t ip6s_localout; /* total ip packets generated here */ uint64_t ip6s_odropped; /* lost packets due to nobufs, etc. */ uint64_t ip6s_reassembled; /* total packets reassembled ok */ uint64_t ip6s_atomicfrags; /* atomic fragments */ uint64_t ip6s_fragmented; /* datagrams successfully fragmented */ uint64_t ip6s_ofragments; /* output fragments created */ uint64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ uint64_t ip6s_badoptions; /* error in option processing */ uint64_t ip6s_noroute; /* packets discarded due to no route */ uint64_t ip6s_badvers; /* ip6 version != 6 */ uint64_t ip6s_rawout; /* total raw ip packets generated */ uint64_t ip6s_badscope; /* scope error */ uint64_t ip6s_notmember; /* don't join this multicast group */ #define IP6S_HDRCNT 256 /* headers count */ uint64_t ip6s_nxthist[IP6S_HDRCNT]; /* next header history */ uint64_t ip6s_m1; /* one mbuf */ #define IP6S_M2MMAX 32 uint64_t ip6s_m2m[IP6S_M2MMAX]; /* two or more mbuf */ uint64_t ip6s_mext1; /* one ext mbuf */ uint64_t ip6s_mext2m; /* two or more ext mbuf */ uint64_t ip6s_exthdrtoolong; /* ext hdr are not contiguous */ uint64_t ip6s_nogif; /* no match gif found */ uint64_t ip6s_toomanyhdr; /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ #define IP6S_RULESMAX 16 #define IP6S_SCOPECNT 16 /* number of times that address selection fails */ uint64_t ip6s_sources_none; /* number of times that an address on the outgoing I/F is chosen */ uint64_t ip6s_sources_sameif[IP6S_SCOPECNT]; /* number of times that an address on a non-outgoing I/F is chosen */ uint64_t ip6s_sources_otherif[IP6S_SCOPECNT]; /* * number of times that an address that has the same scope * from the destination is chosen. */ uint64_t ip6s_sources_samescope[IP6S_SCOPECNT]; /* * number of times that an address that has a different scope * from the destination is chosen. */ uint64_t ip6s_sources_otherscope[IP6S_SCOPECNT]; /* number of times that a deprecated address is chosen */ uint64_t ip6s_sources_deprecated[IP6S_SCOPECNT]; /* number of times that each rule of source selection is applied. */ uint64_t ip6s_sources_rule[IP6S_RULESMAX]; }; #ifdef _KERNEL #include #include VNET_PCPUSTAT_DECLARE(struct ip6stat, ip6stat); #define IP6STAT_ADD(name, val) \ do { \ MIB_SDT_PROBE1(ip6, count, name, (val)); \ VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, (val)); \ } while (0) #define IP6STAT_SUB(name, val) IP6STAT_ADD(name, -(val)) #define IP6STAT_INC(name) IP6STAT_ADD(name, 1) #define IP6STAT_INC2(name, type) \ do { \ MIB_SDT_PROBE2(ip6, count, name, 1, type); \ VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, 1); \ } while (0) #define IP6STAT_DEC(name) IP6STAT_SUB(name, 1) #endif #ifdef _KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #ifdef __NO_STRICT_ALIGNMENT #define IP6_HDR_ALIGNED_P(ip) 1 #else #define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif VNET_DECLARE(int, ip6_defhlim); /* default hop limit */ VNET_DECLARE(int, ip6_defmcasthlim); /* default multicast hop limit */ VNET_DECLARE(int, ip6_forwarding); /* act as router? */ VNET_DECLARE(int, ip6_use_deprecated); /* allow deprecated addr as source */ VNET_DECLARE(int, ip6_rr_prune); /* router renumbering prefix * walk list every 5 sec. */ VNET_DECLARE(int, ip6_mcast_pmtu); /* enable pMTU discovery for multicast? */ VNET_DECLARE(int, ip6_v6only); #define V_ip6_defhlim VNET(ip6_defhlim) #define V_ip6_defmcasthlim VNET(ip6_defmcasthlim) #define V_ip6_forwarding VNET(ip6_forwarding) #define V_ip6_use_deprecated VNET(ip6_use_deprecated) #define V_ip6_rr_prune VNET(ip6_rr_prune) #define V_ip6_mcast_pmtu VNET(ip6_mcast_pmtu) #define V_ip6_v6only VNET(ip6_v6only) VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */ VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA * receiving IF. */ VNET_DECLARE(int, ip6_rfc6204w3); /* Accept defroute from RA even when forwarding enabled */ VNET_DECLARE(int, ip6_hdrnestlimit); /* upper limit of # of extension * headers */ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_mrouter VNET(ip6_mrouter) #define V_ip6_sendredirects VNET(ip6_sendredirects) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) #define V_ip6_no_radr VNET(ip6_no_radr) #define V_ip6_norbit_raif VNET(ip6_norbit_raif) #define V_ip6_rfc6204w3 VNET(ip6_rfc6204w3) #define V_ip6_hdrnestlimit VNET(ip6_hdrnestlimit) #define V_ip6_dad_count VNET(ip6_dad_count) VNET_DECLARE(int, ip6_auto_flowlabel); VNET_DECLARE(int, ip6_auto_linklocal); #define V_ip6_auto_flowlabel VNET(ip6_auto_flowlabel) #define V_ip6_auto_linklocal VNET(ip6_auto_linklocal) VNET_DECLARE(int, ip6_use_tempaddr); /* Whether to use temporary addresses */ VNET_DECLARE(int, ip6_prefer_tempaddr); /* Whether to prefer temporary * addresses in the source address * selection */ #define V_ip6_use_tempaddr VNET(ip6_use_tempaddr) #define V_ip6_prefer_tempaddr VNET(ip6_prefer_tempaddr) VNET_DECLARE(int, ip6_use_defzone); /* Whether to use the default scope * zone when unspecified */ #define V_ip6_use_defzone VNET(ip6_use_defzone) VNET_DECLARE(struct pfil_head *, inet6_pfil_head); #define V_inet6_pfil_head VNET(inet6_pfil_head) #define PFIL_INET6_NAME "inet6" VNET_DECLARE(struct pfil_head *, inet6_local_pfil_head); #define V_inet6_local_pfil_head VNET(inet6_local_pfil_head) #define PFIL_INET6_LOCAL_NAME "inet6-local" #ifdef IPSTEALTH VNET_DECLARE(int, ip6stealth); #define V_ip6stealth VNET(ip6stealth) #endif VNET_DECLARE(bool, ip6_log_cannot_forward); #define V_ip6_log_cannot_forward VNET(ip6_log_cannot_forward) extern struct pr_usrreqs rip6_usrreqs; struct sockopt; struct inpcb; struct ucred; int icmp6_ctloutput(struct socket *, struct sockopt *sopt); void ip6_input(struct mbuf *); void ip6_direct_input(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); int ip6_get_prevhdr(const struct mbuf *, int); int ip6_nexthdr(const struct mbuf *, int, int, int *); int ip6_lasthdr(const struct mbuf *, int, int, int *); extern int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, struct mbuf **, int *); void ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); void ip6_notify_pmtu(struct inpcb *, struct sockaddr_in6 *, u_int32_t); int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t); void ip6_forward(struct mbuf *, int); void ip6_mloopback(struct ifnet *, struct mbuf *); int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, struct ip6_moptions *, struct ifnet **, struct inpcb *); int ip6_ctloutput(struct socket *, struct sockopt *); int ip6_raw_ctloutput(struct socket *, struct sockopt *); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *, struct ip6_pktopts *, struct ucred *, int); void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); int ip6_deletefraghdr(struct mbuf *, int, int); int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int, uint32_t); int route6_input(struct mbuf **, int *, int); void frag6_init(void); void frag6_destroy(void); int frag6_input(struct mbuf **, int *, int); -void frag6_drain(void); +void frag6_drain(void *, int); void rip6_init(void); int rip6_ctloutput(struct socket *, struct sockopt *); int rip6_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *); int dest6_input(struct mbuf **, int *, int); int none_input(struct mbuf **, int *, int); int in6_selectsrc_socket(struct sockaddr_in6 *, struct ip6_pktopts *, struct inpcb *, struct ucred *, int, struct in6_addr *, int *); int in6_selectsrc_addr(uint32_t, const struct in6_addr *, uint32_t, struct ifnet *, struct in6_addr *, int *); int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct nhop_object **, u_int, uint32_t); u_int32_t ip6_randomid(void); u_int32_t ip6_randomflowlabel(void); void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset); int ip6_log_ratelimit(void); /* * Argument type for the last arg of ip6proto_ctlinput_t(). * * IPv6 ICMP IPv6 [exthdrs] finalhdr payload * ^ ^ ^ ^ * | | ip6c_ip6 ip6c_off * | ip6c_icmp6 * ip6c_m * * ip6c_finaldst's sin6_addr usually points to ip6c_ip6->ip6_dst. If the * original * (internal) packet carries a routing header, it may point the * final * destination address in the routing header. * * ip6c_src: ip6c_ip6->ip6_src + scope info + flowlabel in ip6c_ip6 * (beware of flowlabel, if you try to compare it against others) * ip6c_dst: ip6c_finaldst + scope info */ struct ip6ctlparam { struct mbuf *ip6c_m; /* start of mbuf chain */ struct icmp6_hdr *ip6c_icmp6; /* icmp6 header of target packet */ struct ip6_hdr *ip6c_ip6; /* ip6 header of target packet */ int ip6c_off; /* offset of the target proto header */ struct sockaddr_in6 *ip6c_src; /* srcaddr w/ additional info */ struct sockaddr_in6 *ip6c_dst; /* (final) dstaddr w/ additional info */ struct sockaddr_in6 *ip6c_finaldst; /* final destination address */ void *ip6c_cmdarg; /* control command dependent data */ u_int8_t ip6c_nxt; /* final next header field */ }; typedef int ip6proto_input_t(struct mbuf **, int *, int); typedef void ip6proto_ctlinput_t(struct ip6ctlparam *); int ip6proto_register(uint8_t, ip6proto_input_t, ip6proto_ctlinput_t); int ip6proto_unregister(uint8_t); #define IP6PROTO_REGISTER(prot, input, ctl) do { \ int error __diagused; \ error = ip6proto_register(prot, input, ctl); \ MPASS(error == 0); \ } while (0) ip6proto_input_t rip6_input; ip6proto_ctlinput_t rip6_ctlinput; #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */ diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c index 92f067f5af8c..dd0405d3aeac 100644 --- a/sys/ufs/ufs/ufs_dirhash.c +++ b/sys/ufs/ufs/ufs_dirhash.c @@ -1,1324 +1,1324 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This implements a hash-based lookup scheme for UFS directories. */ #include #include "opt_ufs.h" #ifdef UFS_DIRHASH #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) #define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) #define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); static int ufs_mindirhashsize = DIRBLKSIZ * 5; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize, 0, "minimum directory size in bytes for which to use hashed lookup"); static int ufs_dirhashmaxmem = 2 * 1024 * 1024; /* NOTE: initial value. It is tuned in ufsdirhash_init() */ SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem, 0, "maximum allowed dirhash memory usage"); static int ufs_dirhashmem; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem, 0, "current dirhash memory usage"); static int ufs_dirhashcheck = 0; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck, 0, "enable extra sanity tests"); static int ufs_dirhashlowmemcount = 0; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD, &ufs_dirhashlowmemcount, 0, "number of times low memory hook called"); static int ufs_dirhashreclaimpercent = 10; static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, ufsdirhash_set_reclaimpercent, "I", "set percentage of dirhash cache to be removed in low VM events"); static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff); static void ufsdirhash_delslot(struct dirhash *dh, int slot); static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset); static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset); static int ufsdirhash_recycle(int wanted); -static void ufsdirhash_lowmem(void); +static void ufsdirhash_lowmem(void *, int); static void ufsdirhash_free_locked(struct inode *ip); static uma_zone_t ufsdirhash_zone; #define DIRHASHLIST_LOCK() mtx_lock(&ufsdirhash_mtx) #define DIRHASHLIST_UNLOCK() mtx_unlock(&ufsdirhash_mtx) #define DIRHASH_BLKALLOC() uma_zalloc(ufsdirhash_zone, M_NOWAIT) #define DIRHASH_BLKFREE(ptr) uma_zfree(ufsdirhash_zone, (ptr)) #define DIRHASH_ASSERT_LOCKED(dh) \ sx_assert(&(dh)->dh_lock, SA_LOCKED) /* Dirhash list; recently-used entries are near the tail. */ static TAILQ_HEAD(, dirhash) ufsdirhash_list; /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ static struct mtx ufsdirhash_mtx; /* * Locking: * * The relationship between inode and dirhash is protected either by an * exclusive vnode lock or the vnode interlock where a shared vnode lock * may be used. The dirhash_mtx is acquired after the dirhash lock. To * handle teardown races, code wishing to lock the dirhash for an inode * when using a shared vnode lock must obtain a private reference on the * dirhash while holding the vnode interlock. They can drop it once they * have obtained the dirhash lock and verified that the dirhash wasn't * recycled while they waited for the dirhash lock. * * ufsdirhash_build() acquires a shared lock on the dirhash when it is * successful. This lock is released after a call to ufsdirhash_lookup(). * * Functions requiring exclusive access use ufsdirhash_acquire() which may * free a dirhash structure that was recycled by ufsdirhash_recycle(). * * The dirhash lock may be held across io operations. * * WITNESS reports a lock order reversal between the "bufwait" lock * and the "dirhash" lock. However, this specific reversal will not * cause a deadlock. To get a deadlock, one would have to lock a * buffer followed by the dirhash while a second thread locked a * buffer while holding the dirhash lock. The second order can happen * under a shared or exclusive vnode lock for the associated directory * in lookup(). The first order, however, can only happen under an * exclusive vnode lock (e.g. unlink(), rename(), etc.). Thus, for * a thread to be doing a "bufwait" -> "dirhash" order, it has to hold * an exclusive vnode lock. That exclusive vnode lock will prevent * any other threads from doing a "dirhash" -> "bufwait" order. */ static void ufsdirhash_hold(struct dirhash *dh) { refcount_acquire(&dh->dh_refcount); } static void ufsdirhash_drop(struct dirhash *dh) { if (refcount_release(&dh->dh_refcount)) { sx_destroy(&dh->dh_lock); free(dh, M_DIRHASH); } } /* * Release the lock on a dirhash. */ static void ufsdirhash_release(struct dirhash *dh) { sx_unlock(&dh->dh_lock); } /* * Either acquire an existing hash locked shared or create a new hash and * return it exclusively locked. May return NULL if the allocation fails. * * The vnode interlock is used to protect the i_dirhash pointer from * simultaneous access while only a shared vnode lock is held. */ static struct dirhash * ufsdirhash_create(struct inode *ip) { struct dirhash *ndh; struct dirhash *dh; struct vnode *vp; bool excl; ndh = dh = NULL; vp = ip->i_vnode; excl = false; for (;;) { /* Racy check for i_dirhash to prefetch a dirhash structure. */ if (ip->i_dirhash == NULL && ndh == NULL) { ndh = malloc(sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO); if (ndh == NULL) return (NULL); refcount_init(&ndh->dh_refcount, 1); /* * The DUPOK is to prevent warnings from the * sx_slock() a few lines down which is safe * since the duplicate lock in that case is * the one for this dirhash we are creating * now which has no external references until * after this function returns. */ sx_init_flags(&ndh->dh_lock, "dirhash", SX_DUPOK); sx_xlock(&ndh->dh_lock); } /* * Check i_dirhash. If it's NULL just try to use a * preallocated structure. If none exists loop and try again. */ VI_LOCK(vp); dh = ip->i_dirhash; if (dh == NULL) { ip->i_dirhash = ndh; VI_UNLOCK(vp); if (ndh == NULL) continue; return (ndh); } ufsdirhash_hold(dh); VI_UNLOCK(vp); /* Acquire a lock on existing hashes. */ if (excl) sx_xlock(&dh->dh_lock); else sx_slock(&dh->dh_lock); /* The hash could've been recycled while we were waiting. */ VI_LOCK(vp); if (ip->i_dirhash != dh) { VI_UNLOCK(vp); ufsdirhash_release(dh); ufsdirhash_drop(dh); continue; } VI_UNLOCK(vp); ufsdirhash_drop(dh); /* If the hash is still valid we've succeeded. */ if (dh->dh_hash != NULL) break; /* * If the hash is NULL it has been recycled. Try to upgrade * so we can recreate it. If we fail the upgrade, drop our * lock and try again. */ if (excl || sx_try_upgrade(&dh->dh_lock)) break; sx_sunlock(&dh->dh_lock); excl = true; } /* Free the preallocated structure if it was not necessary. */ if (ndh) { ufsdirhash_release(ndh); ufsdirhash_drop(ndh); } return (dh); } /* * Acquire an exclusive lock on an existing hash. Requires an exclusive * vnode lock to protect the i_dirhash pointer. hashes that have been * recycled are reclaimed here and NULL is returned. */ static struct dirhash * ufsdirhash_acquire(struct inode *ip) { struct dirhash *dh; ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__); dh = ip->i_dirhash; if (dh == NULL) return (NULL); sx_xlock(&dh->dh_lock); if (dh->dh_hash != NULL) return (dh); ufsdirhash_free_locked(ip); return (NULL); } /* * Acquire exclusively and free the hash pointed to by ip. Works with a * shared or exclusive vnode lock. */ void ufsdirhash_free(struct inode *ip) { struct dirhash *dh; struct vnode *vp; vp = ip->i_vnode; for (;;) { /* Grab a reference on this inode's dirhash if it has one. */ VI_LOCK(vp); dh = ip->i_dirhash; if (dh == NULL) { VI_UNLOCK(vp); return; } ufsdirhash_hold(dh); VI_UNLOCK(vp); /* Exclusively lock the dirhash. */ sx_xlock(&dh->dh_lock); /* If this dirhash still belongs to this inode, then free it. */ VI_LOCK(vp); if (ip->i_dirhash == dh) { VI_UNLOCK(vp); ufsdirhash_drop(dh); break; } VI_UNLOCK(vp); /* * This inode's dirhash has changed while we were * waiting for the dirhash lock, so try again. */ ufsdirhash_release(dh); ufsdirhash_drop(dh); } ufsdirhash_free_locked(ip); } /* * Attempt to build up a hash table for the directory contents in * inode 'ip'. Returns 0 on success, or -1 of the operation failed. */ int ufsdirhash_build(struct inode *ip) { struct dirhash *dh; struct buf *bp = NULL; struct direct *ep; struct vnode *vp; doff_t bmask, pos; uint64_t dirblocks, i, narrays, nblocks, nslots; int j, memreqd, slot; /* Take care of a decreased sysctl value. */ while (ufs_dirhashmem > ufs_dirhashmaxmem) { if (ufsdirhash_recycle(0) != 0) return (-1); /* Recycled enough memory, so unlock the list. */ DIRHASHLIST_UNLOCK(); } /* Check if we can/should use dirhash. */ if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) || ip->i_effnlink == 0) { if (ip->i_dirhash) ufsdirhash_free(ip); return (-1); } dh = ufsdirhash_create(ip); if (dh == NULL) return (-1); if (dh->dh_hash != NULL) return (0); vp = ip->i_vnode; /* Allocate 50% more entries than this dir size could ever need. */ KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size")); nslots = ip->i_size / DIRECTSIZ(1); nslots = (nslots * 3 + 1) / 2; narrays = howmany(nslots, DH_NBLKOFF); nslots = narrays * DH_NBLKOFF; dirblocks = howmany(ip->i_size, DIRBLKSIZ); nblocks = (dirblocks * 3 + 1) / 2; memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + nblocks * sizeof(*dh->dh_blkfree); DIRHASHLIST_LOCK(); if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) { DIRHASHLIST_UNLOCK(); if (memreqd > ufs_dirhashmaxmem / 2) goto fail; /* Try to free some space. */ if (ufsdirhash_recycle(memreqd) != 0) goto fail; /* Enough was freed, and list has been locked. */ } ufs_dirhashmem += memreqd; DIRHASHLIST_UNLOCK(); /* Initialise the hash table and block statistics. */ dh->dh_memreq = memreqd; dh->dh_narrays = narrays; dh->dh_hlen = nslots; dh->dh_nblk = nblocks; dh->dh_dirblks = dirblocks; for (i = 0; i < DH_NFSTATS; i++) dh->dh_firstfree[i] = -1; dh->dh_firstfree[DH_NFSTATS] = 0; dh->dh_hused = 0; dh->dh_seqoff = -1; dh->dh_score = DH_SCOREINIT; dh->dh_lastused = time_second; /* * Use non-blocking mallocs so that we will revert to a linear * lookup on failure rather than potentially blocking forever. */ dh->dh_hash = malloc(narrays * sizeof(dh->dh_hash[0]), M_DIRHASH, M_NOWAIT | M_ZERO); if (dh->dh_hash == NULL) goto fail; dh->dh_blkfree = malloc(nblocks * sizeof(dh->dh_blkfree[0]), M_DIRHASH, M_NOWAIT); if (dh->dh_blkfree == NULL) goto fail; for (i = 0; i < narrays; i++) { if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) goto fail; for (j = 0; j < DH_NBLKOFF; j++) dh->dh_hash[i][j] = DIRHASH_EMPTY; } for (i = 0; i < dirblocks; i++) dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN; bmask = vp->v_mount->mnt_stat.f_iosize - 1; pos = 0; while (pos < ip->i_size) { /* If necessary, get the next directory block. */ if ((pos & bmask) == 0) { if (bp != NULL) brelse(bp); if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0) goto fail; } /* Add this entry to the hash. */ ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) { /* Corrupted directory. */ brelse(bp); goto fail; } if (ep->d_ino != 0) { /* Add the entry (simplified ufsdirhash_add). */ slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); dh->dh_hused++; DH_ENTRY(dh, slot) = pos; ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep)); } pos += ep->d_reclen; } if (bp != NULL) brelse(bp); DIRHASHLIST_LOCK(); TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 1; DIRHASHLIST_UNLOCK(); sx_downgrade(&dh->dh_lock); return (0); fail: ufsdirhash_free_locked(ip); return (-1); } /* * Free any hash table associated with inode 'ip'. */ static void ufsdirhash_free_locked(struct inode *ip) { struct dirhash *dh; struct vnode *vp; int i; DIRHASH_ASSERT_LOCKED(ip->i_dirhash); /* * Clear the pointer in the inode to prevent new threads from * finding the dead structure. */ vp = ip->i_vnode; VI_LOCK(vp); dh = ip->i_dirhash; ip->i_dirhash = NULL; VI_UNLOCK(vp); /* * Remove the hash from the list since we are going to free its * memory. */ DIRHASHLIST_LOCK(); if (dh->dh_onlist) TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); ufs_dirhashmem -= dh->dh_memreq; DIRHASHLIST_UNLOCK(); /* * At this point, any waiters for the lock should hold their * own reference on the dirhash structure. They will drop * that reference once they grab the vnode interlock and see * that ip->i_dirhash is NULL. */ sx_xunlock(&dh->dh_lock); /* * Handle partially recycled as well as fully constructed hashes. */ if (dh->dh_hash != NULL) { for (i = 0; i < dh->dh_narrays; i++) if (dh->dh_hash[i] != NULL) DIRHASH_BLKFREE(dh->dh_hash[i]); free(dh->dh_hash, M_DIRHASH); if (dh->dh_blkfree != NULL) free(dh->dh_blkfree, M_DIRHASH); } /* * Drop the inode's reference to the data structure. */ ufsdirhash_drop(dh); } /* * Find the offset of the specified name within the given inode. * Returns 0 on success, ENOENT if the entry does not exist, or * EJUSTRETURN if the caller should revert to a linear search. * * If successful, the directory offset is stored in *offp, and a * pointer to a struct buf containing the entry is stored in *bpp. If * prevoffp is non-NULL, the offset of the previous entry within * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry * is the first in a block, the start of the block is used). * * Must be called with the hash locked. Returns with the hash unlocked. */ int ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp, struct buf **bpp, doff_t *prevoffp) { struct dirhash *dh, *dh_next; struct direct *dp; struct vnode *vp; struct buf *bp; doff_t blkoff, bmask, offset, prevoff, seqoff; int i, slot; int error; dh = ip->i_dirhash; KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_lookup: Invalid dirhash %p\n", dh)); DIRHASH_ASSERT_LOCKED(dh); /* * Move this dirhash towards the end of the list if it has a * score higher than the next entry, and acquire the dh_lock. */ DIRHASHLIST_LOCK(); if (TAILQ_NEXT(dh, dh_list) != NULL) { /* * If the new score will be greater than that of the next * entry, then move this entry past it. With both mutexes * held, dh_next won't go away, but its dh_score could * change; that's not important since it is just a hint. */ if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && dh->dh_score >= dh_next->dh_score) { KASSERT(dh->dh_onlist, ("dirhash: not on list")); TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, dh_list); } } /* Update the score. */ if (dh->dh_score < DH_SCOREMAX) dh->dh_score++; /* Update last used time. */ dh->dh_lastused = time_second; DIRHASHLIST_UNLOCK(); vp = ip->i_vnode; bmask = vp->v_mount->mnt_stat.f_iosize - 1; blkoff = -1; bp = NULL; seqoff = dh->dh_seqoff; restart: slot = ufsdirhash_hash(dh, name, namelen); if (seqoff != -1) { /* * Sequential access optimisation. seqoff contains the * offset of the directory entry immediately following * the last entry that was looked up. Check if this offset * appears in the hash chain for the name we are looking for. */ for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; i = WRAPINCR(i, dh->dh_hlen)) if (offset == seqoff) break; if (offset == seqoff) { /* * We found an entry with the expected offset. This * is probably the entry we want, but if not, the * code below will retry. */ slot = i; } else seqoff = -1; } for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; slot = WRAPINCR(slot, dh->dh_hlen)) { if (offset == DIRHASH_DEL) continue; if (offset < 0 || offset >= ip->i_size) panic("ufsdirhash_lookup: bad offset in hash array"); if ((offset & ~bmask) != blkoff) { if (bp != NULL) brelse(bp); blkoff = offset & ~bmask; if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) { error = EJUSTRETURN; goto fail; } } KASSERT(bp != NULL, ("no buffer allocated")); dp = (struct direct *)(bp->b_data + (offset & bmask)); if (dp->d_reclen == 0 || dp->d_reclen > DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) { /* Corrupted directory. */ error = EJUSTRETURN; goto fail; } if (dp->d_namlen == namelen && bcmp(dp->d_name, name, namelen) == 0) { /* Found. Get the prev offset if needed. */ if (prevoffp != NULL) { if (offset & (DIRBLKSIZ - 1)) { prevoff = ufsdirhash_getprev(dp, offset); if (prevoff == -1) { error = EJUSTRETURN; goto fail; } } else prevoff = offset; *prevoffp = prevoff; } /* Update offset. */ dh->dh_seqoff = offset + DIRSIZ(0, dp); *bpp = bp; *offp = offset; ufsdirhash_release(dh); return (0); } /* * When the name doesn't match in the sequential * optimization case, go back and search normally. */ if (seqoff != -1) { seqoff = -1; goto restart; } } error = ENOENT; fail: ufsdirhash_release(dh); if (bp != NULL) brelse(bp); return (error); } /* * Find a directory block with room for 'slotneeded' bytes. Returns * the offset of the directory entry that begins the free space. * This will either be the offset of an existing entry that has free * space at the end, or the offset of an entry with d_ino == 0 at * the start of a DIRBLKSIZ block. * * To use the space, the caller may need to compact existing entries in * the directory. The total number of bytes in all of the entries involved * in the compaction is stored in *slotsize. In other words, all of * the entries that must be compacted are exactly contained in the * region beginning at the returned offset and spanning *slotsize bytes. * * Returns -1 if no space was found, indicating that the directory * must be extended. */ doff_t ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) { struct direct *dp; struct dirhash *dh; struct buf *bp; doff_t pos, slotstart; int dirblock, error, freebytes, i; dh = ip->i_dirhash; KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_findfree: Invalid dirhash %p\n", dh)); DIRHASH_ASSERT_LOCKED(dh); /* Find a directory block with the desired free space. */ dirblock = -1; for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) if ((dirblock = dh->dh_firstfree[i]) != -1) break; if (dirblock == -1) return (-1); KASSERT(dirblock < dh->dh_nblk && dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN), ("ufsdirhash_findfree: bad stats")); pos = dirblock * DIRBLKSIZ; error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp); if (error) return (-1); /* Find the first entry with free space. */ for (i = 0; i < DIRBLKSIZ; ) { if (dp->d_reclen == 0) { brelse(bp); return (-1); } if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp)) break; i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } slotstart = pos + i; /* Find the range of entries needed to get enough space */ freebytes = 0; while (i < DIRBLKSIZ && freebytes < slotneeded) { freebytes += dp->d_reclen; if (dp->d_ino != 0) freebytes -= DIRSIZ(0, dp); if (dp->d_reclen == 0) { brelse(bp); return (-1); } i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } if (freebytes < slotneeded) panic("ufsdirhash_findfree: free mismatch"); brelse(bp); *slotsize = pos + i - slotstart; return (slotstart); } /* * Return the start of the unused space at the end of a directory, or * -1 if there are no trailing unused blocks. */ doff_t ufsdirhash_enduseful(struct inode *ip) { struct dirhash *dh; int i; dh = ip->i_dirhash; DIRHASH_ASSERT_LOCKED(dh); KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh)); if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) return (-1); for (i = dh->dh_dirblks - 1; i >= 0; i--) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) break; return ((doff_t)(i + 1) * DIRBLKSIZ); } /* * Insert information into the hash about a new directory entry. dirp * points to a struct direct containing the entry, and offset specifies * the offset of this entry. */ void ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_add: bad offset")); /* * Normal hash usage is < 66%. If the usage gets too high then * remove the hash entirely and let it be rebuilt later. */ if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { ufsdirhash_free_locked(ip); return; } /* Find a free hash slot (empty or deleted), and add the entry. */ slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); while (DH_ENTRY(dh, slot) >= 0) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) dh->dh_hused++; DH_ENTRY(dh, slot) = offset; /* Update last used time. */ dh->dh_lastused = time_second; /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp)); ufsdirhash_release(dh); } /* * Remove the specified directory entry from the hash. The entry to remove * is defined by the name in `dirp', which must exist at the specified * `offset' within the directory. */ void ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_remove: bad offset")); /* Find the entry */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); /* Remove the hash entry. */ ufsdirhash_delslot(dh, slot); /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp)); ufsdirhash_release(dh); } /* * Change the offset associated with a directory entry in the hash. Used * when compacting directory blocks. */ void ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, doff_t newoff) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ && newoff < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_move: bad offset")); /* Find the entry, and update the offset. */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); DH_ENTRY(dh, slot) = newoff; ufsdirhash_release(dh); } /* * Inform dirhash that the directory has grown by one block that * begins at offset (i.e. the new length is offset + DIRBLKSIZ). */ void ufsdirhash_newblk(struct inode *ip, doff_t offset) { struct dirhash *dh; int block; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_newblk: bad offset")); block = offset / DIRBLKSIZ; if (block >= dh->dh_nblk) { /* Out of space; must rebuild. */ ufsdirhash_free_locked(ip); return; } dh->dh_dirblks = block + 1; /* Account for the new free block. */ dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN; if (dh->dh_firstfree[DH_NFSTATS] == -1) dh->dh_firstfree[DH_NFSTATS] = block; ufsdirhash_release(dh); } /* * Inform dirhash that the directory is being truncated. */ void ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) { struct dirhash *dh; int block, i; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_dirtrunc: bad offset")); block = howmany(offset, DIRBLKSIZ); /* * If the directory shrinks to less than 1/8 of dh_nblk blocks * (about 20% of its original size due to the 50% extra added in * ufsdirhash_build) then free it, and let the caller rebuild * if necessary. */ if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { ufsdirhash_free_locked(ip); return; } /* * Remove any `first free' information pertaining to the * truncated blocks. All blocks we're removing should be * completely unused. */ if (dh->dh_firstfree[DH_NFSTATS] >= block) dh->dh_firstfree[DH_NFSTATS] = -1; for (i = block; i < dh->dh_dirblks; i++) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) panic("ufsdirhash_dirtrunc: blocks in use"); for (i = 0; i < DH_NFSTATS; i++) if (dh->dh_firstfree[i] >= block) panic("ufsdirhash_dirtrunc: first free corrupt"); dh->dh_dirblks = block; ufsdirhash_release(dh); } /* * Debugging function to check that the dirhash information about * a directory block matches its actual contents. Panics if a mismatch * is detected. * * On entry, `buf' should point to the start of an in-core * DIRBLKSIZ-sized directory block, and `offset' should contain the * offset from the start of the directory of that block. */ void ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset) { struct dirhash *dh; struct direct *dp; int block, ffslot, i, nfree; if (!ufs_dirhashcheck) return; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; block = offset / DIRBLKSIZ; if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks) panic("ufsdirhash_checkblock: bad offset"); nfree = 0; for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) { dp = (struct direct *)(buf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir"); if (dp->d_ino == 0) { #if 0 /* * XXX entries with d_ino == 0 should only occur * at the start of a DIRBLKSIZ block. However the * ufs code is tolerant of such entries at other * offsets, and fsck does not fix them. */ if (i != 0) panic("ufsdirhash_checkblock: bad dir inode"); #endif nfree += dp->d_reclen; continue; } /* Check that the entry exists (will panic if it doesn't). */ ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); nfree += dp->d_reclen - DIRSIZ(0, dp); } if (i != DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir end"); if (dh->dh_blkfree[block] * DIRALIGN != nfree) panic("ufsdirhash_checkblock: bad free count"); ffslot = BLKFREE2IDX(nfree / DIRALIGN); for (i = 0; i <= DH_NFSTATS; i++) if (dh->dh_firstfree[i] == block && i != ffslot) panic("ufsdirhash_checkblock: bad first-free"); if (dh->dh_firstfree[ffslot] == -1) panic("ufsdirhash_checkblock: missing first-free entry"); ufsdirhash_release(dh); } /* * Hash the specified filename into a dirhash slot. */ static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen) { uint32_t hash; /* * We hash the name and then some other bit of data that is * invariant over the dirhash's lifetime. Otherwise names * differing only in the last byte are placed close to one * another in the table, which is bad for linear probing. */ hash = fnv_32_buf(name, namelen, FNV1_32_INIT); hash = fnv_32_buf(&dh, sizeof(dh), hash); return (hash % dh->dh_hlen); } /* * Adjust the number of free bytes in the block containing `offset' * by the value specified by `diff'. * * The caller must ensure we have exclusive access to `dh'; normally * that means that dh_lock should be held, but this is also called * from ufsdirhash_build() where exclusive access can be assumed. */ static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff) { int block, i, nfidx, ofidx; /* Update the per-block summary info. */ block = offset / DIRBLKSIZ; KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks, ("dirhash bad offset")); ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); /* Update the `first free' list if necessary. */ if (ofidx != nfidx) { /* If removing, scan forward for the next block. */ if (dh->dh_firstfree[ofidx] == block) { for (i = block + 1; i < dh->dh_dirblks; i++) if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) break; dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; } /* Make this the new `first free' if necessary */ if (dh->dh_firstfree[nfidx] > block || dh->dh_firstfree[nfidx] == -1) dh->dh_firstfree[nfidx] = block; } } /* * Find the specified name which should have the specified offset. * Returns a slot number, and panics on failure. * * `dh' must be locked on entry and remains so on return. */ static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset) { int slot; DIRHASH_ASSERT_LOCKED(dh); /* Find the entry. */ KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full")); slot = ufsdirhash_hash(dh, name, namelen); while (DH_ENTRY(dh, slot) != offset && DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) != offset) panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); return (slot); } /* * Remove the entry corresponding to the specified slot from the hash array. * * `dh' must be locked on entry and remains so on return. */ static void ufsdirhash_delslot(struct dirhash *dh, int slot) { int i; DIRHASH_ASSERT_LOCKED(dh); /* Mark the entry as deleted. */ DH_ENTRY(dh, slot) = DIRHASH_DEL; /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) i = WRAPINCR(i, dh->dh_hlen); if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { i = WRAPDECR(i, dh->dh_hlen); while (DH_ENTRY(dh, i) == DIRHASH_DEL) { DH_ENTRY(dh, i) = DIRHASH_EMPTY; dh->dh_hused--; i = WRAPDECR(i, dh->dh_hlen); } KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen")); } } /* * Given a directory entry and its offset, find the offset of the * previous entry in the same DIRBLKSIZ-sized block. Returns an * offset, or -1 if there is no previous entry in the block or some * other problem occurred. */ static doff_t ufsdirhash_getprev(struct direct *dirp, doff_t offset) { struct direct *dp; char *blkbuf; doff_t blkoff, prevoff; int entrypos, i; blkoff = rounddown2(offset, DIRBLKSIZ); /* offset of start of block */ entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ blkbuf = (char *)dirp - entrypos; prevoff = blkoff; /* If `offset' is the start of a block, there is no previous entry. */ if (entrypos == 0) return (-1); /* Scan from the start of the block until we get to the entry. */ for (i = 0; i < entrypos; i += dp->d_reclen) { dp = (struct direct *)(blkbuf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) return (-1); /* Corrupted directory. */ prevoff = blkoff + i; } return (prevoff); } /* * Delete the given dirhash and reclaim its memory. Assumes that * ufsdirhash_list is locked, and leaves it locked. Also assumes * that dh is locked. Returns the amount of memory freed. */ static int ufsdirhash_destroy(struct dirhash *dh) { doff_t **hash; uint8_t *blkfree; int i, mem, narrays; KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list")); /* Remove it from the list and detach its memory. */ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 0; hash = dh->dh_hash; dh->dh_hash = NULL; blkfree = dh->dh_blkfree; dh->dh_blkfree = NULL; narrays = dh->dh_narrays; mem = dh->dh_memreq; dh->dh_memreq = 0; /* Unlock dirhash and free the detached memory. */ ufsdirhash_release(dh); for (i = 0; i < narrays; i++) DIRHASH_BLKFREE(hash[i]); free(hash, M_DIRHASH); free(blkfree, M_DIRHASH); /* Account for the returned memory. */ ufs_dirhashmem -= mem; return (mem); } /* * Try to free up `wanted' bytes by stealing memory from existing * dirhashes. Returns zero with list locked if successful. */ static int ufsdirhash_recycle(int wanted) { struct dirhash *dh; DIRHASHLIST_LOCK(); dh = TAILQ_FIRST(&ufsdirhash_list); while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { /* Decrement the score; only recycle if it becomes zero. */ if (dh == NULL || --dh->dh_score > 0) { DIRHASHLIST_UNLOCK(); return (-1); } /* * If we can't lock it it's in use and we don't want to * recycle it anyway. */ if (!sx_try_xlock(&dh->dh_lock)) { dh = TAILQ_NEXT(dh, dh_list); continue; } ufsdirhash_destroy(dh); /* Repeat if necessary. */ dh = TAILQ_FIRST(&ufsdirhash_list); } /* Success; return with list locked. */ return (0); } /* * Callback that frees some dirhashes when the system is low on virtual memory. */ static void -ufsdirhash_lowmem(void) +ufsdirhash_lowmem(void *arg __unused, int flags __unused) { struct dirhash *dh, *dh_temp; int memfreed, memwanted; ufs_dirhashlowmemcount++; memfreed = 0; memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100; DIRHASHLIST_LOCK(); /* * Reclaim up to memwanted from the oldest dirhashes. This will allow * us to make some progress when the system is running out of memory * without compromising the dinamicity of maximum age. If the situation * does not improve lowmem will be eventually retriggered and free some * other entry in the cache. The entries on the head of the list should * be the oldest. If during list traversal we can't get a lock on the * dirhash, it will be skipped. */ TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) { if (sx_try_xlock(&dh->dh_lock)) memfreed += ufsdirhash_destroy(dh); if (memfreed >= memwanted) break; } DIRHASHLIST_UNLOCK(); } static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS) { int error, v; v = ufs_dirhashreclaimpercent; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == ufs_dirhashreclaimpercent) return (0); /* Refuse invalid percentages */ if (v < 0 || v > 100) return (EINVAL); ufs_dirhashreclaimpercent = v; return (0); } void ufsdirhash_init(void) { ufs_dirhashmaxmem = lmax(roundup(hibufspace / 64, PAGE_SIZE), 2 * 1024 * 1024); ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF); TAILQ_INIT(&ufsdirhash_list); /* Register a callback function to handle low memory signals */ EVENTHANDLER_REGISTER(vm_lowmem, ufsdirhash_lowmem, NULL, EVENTHANDLER_PRI_FIRST); } void ufsdirhash_uninit(void) { KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit")); uma_zdestroy(ufsdirhash_zone); mtx_destroy(&ufsdirhash_mtx); } #endif /* UFS_DIRHASH */