Index: head/sys/kern/kern_malloc.c =================================================================== --- head/sys/kern/kern_malloc.c (revision 353595) +++ head/sys/kern/kern_malloc.c (revision 353596) @@ -1,1373 +1,1377 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2005-2009 Robert N. M. Watson * Copyright (c) 2008 Otto Moerbeek (mallocarray) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 */ /* * Kernel malloc(9) implementation -- general purpose kernel memory allocator * based on memory types. Back end is implemented using the UMA(9) zone * allocator. A set of fixed-size buckets are used for smaller allocations, * and a special UMA allocation interface is used for larger allocations. * Callers declare memory types, and statistics are maintained independently * for each memory type. Statistics are maintained per-CPU for performance * reasons. See malloc(9) and comments in malloc.h for a detailed * description. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #ifdef DEBUG_REDZONE #include #endif #if defined(INVARIANTS) && defined(__i386__) #include #endif #include #ifdef KDTRACE_HOOKS #include bool __read_frequently dtrace_malloc_enabled; dtrace_malloc_probe_func_t __read_mostly dtrace_malloc_probe; #endif #if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) || \ defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE) #define MALLOC_DEBUG 1 #endif /* * When realloc() is called, if the new size is sufficiently smaller than * the old size, realloc() will allocate a new, smaller block to avoid * wasting memory. 'Sufficiently smaller' is defined as: newsize <= * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. 
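 *
 * For example, with the default REALLOC_FRACTION of 1, shrinking a
 * 1024-byte block to 512 bytes or less allocates a fresh, smaller block,
 * while a request for 513 to 1024 bytes simply reuses the existing one.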
*/ #ifndef REALLOC_FRACTION #define REALLOC_FRACTION 1 /* new block if <= half the size */ #endif /* * Centrally define some common malloc types. */ MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches"); MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers"); static struct malloc_type *kmemstatistics; static int kmemcount; #define KMEM_ZSHIFT 4 #define KMEM_ZBASE 16 #define KMEM_ZMASK (KMEM_ZBASE - 1) #define KMEM_ZMAX 65536 #define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT) static uint8_t kmemsize[KMEM_ZSIZE + 1]; #ifndef MALLOC_DEBUG_MAXZONES #define MALLOC_DEBUG_MAXZONES 1 #endif static int numzones = MALLOC_DEBUG_MAXZONES; /* * Small malloc(9) memory allocations are allocated from a set of UMA buckets * of various sizes. * * XXX: The comment here used to read "These won't be powers of two for * long." It's possible that a significant amount of wasted memory could be * recovered by tuning the sizes of these buckets. */ struct { int kz_size; char *kz_name; uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES]; } kmemzones[] = { {16, "16", }, {32, "32", }, {64, "64", }, {128, "128", }, {256, "256", }, {512, "512", }, {1024, "1024", }, {2048, "2048", }, {4096, "4096", }, {8192, "8192", }, {16384, "16384", }, {32768, "32768", }, {65536, "65536", }, {0, NULL}, }; /* * Zone to allocate malloc type descriptions from. For ABI reasons, memory * types are described by a data structure passed by the declaring code, but * the malloc(9) implementation has its own data structure describing the * type and statistics. This permits the malloc(9)-internal data structures * to be modified without breaking binary-compiled kernel modules that * declare malloc types. */ static uma_zone_t mt_zone; static uma_zone_t mt_stats_zone; u_long vm_kmem_size; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0, "Size of kernel memory"); static u_long kmem_zmax = KMEM_ZMAX; SYSCTL_ULONG(_vm, OID_AUTO, kmem_zmax, CTLFLAG_RDTUN, &kmem_zmax, 0, "Maximum allocation size that malloc(9) would use UMA as backend"); static u_long vm_kmem_size_min; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0, "Minimum size of kernel memory"); static u_long vm_kmem_size_max; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0, "Maximum size of kernel memory"); static u_int vm_kmem_size_scale; SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0, "Scale factor for kernel memory size"); static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size, CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmem_map_size, "LU", "Current kmem allocation size"); static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free, CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmem_map_free, "LU", "Free space in kmem"); /* * The malloc_mtx protects the kmemstatistics linked list. */ struct mtx malloc_mtx; #ifdef MALLOC_PROFILE uint64_t krequests[KMEM_ZSIZE + 1]; static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS); #endif static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS); /* * time_uptime of the last malloc(9) failure (induced or real). 
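 * malloc_last_fail() reports the number of seconds elapsed since this
 * timestamp.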
*/ static time_t t_malloc_fail; #if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1) static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0, "Kernel malloc debugging options"); #endif /* * malloc(9) fault injection -- cause malloc failures every (n) mallocs when * the caller specifies M_NOWAIT. If set to 0, no failures are caused. */ #ifdef MALLOC_MAKE_FAILURES static int malloc_failure_rate; static int malloc_nowait_count; static int malloc_failure_count; SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RWTUN, &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail"); SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD, &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures"); #endif static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS) { u_long size; size = uma_size(); return (sysctl_handle_long(oidp, &size, 0, req)); } static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS) { u_long size, limit; /* The sysctl is unsigned, implement as a saturation value. */ size = uma_size(); limit = uma_limit(); if (size > limit) size = 0; else size = limit - size; return (sysctl_handle_long(oidp, &size, 0, req)); } /* * malloc(9) uma zone separation -- sub-page buffer overruns in one * malloc type will affect only a subset of other malloc types. */ #if MALLOC_DEBUG_MAXZONES > 1 static void tunable_set_numzones(void) { TUNABLE_INT_FETCH("debug.malloc.numzones", &numzones); /* Sanity check the number of malloc uma zones. */ if (numzones <= 0) numzones = 1; if (numzones > MALLOC_DEBUG_MAXZONES) numzones = MALLOC_DEBUG_MAXZONES; } SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL); SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &numzones, 0, "Number of malloc uma subzones"); /* * Any number that changes regularly is an okay choice for the * offset. Build numbers are pretty good of you have them. */ static u_int zone_offset = __FreeBSD_version; TUNABLE_INT("debug.malloc.zone_offset", &zone_offset); SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN, &zone_offset, 0, "Separate malloc types by examining the " "Nth character in the malloc type short description."); static void mtp_set_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; const char *desc; size_t len; u_int val; mtip = mtp->ks_handle; desc = mtp->ks_shortdesc; if (desc == NULL || (len = strlen(desc)) == 0) val = 0; else val = desc[zone_offset % len]; mtip->mti_zone = (val % numzones); } static inline u_int mtp_get_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; mtip = mtp->ks_handle; KASSERT(mtip->mti_zone < numzones, ("mti_zone %u out of range %d", mtip->mti_zone, numzones)); return (mtip->mti_zone); } #elif MALLOC_DEBUG_MAXZONES == 0 #error "MALLOC_DEBUG_MAXZONES must be positive." #else static void mtp_set_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; mtip = mtp->ks_handle; mtip->mti_zone = 0; } static inline u_int mtp_get_subzone(struct malloc_type *mtp) { return (0); } #endif /* MALLOC_DEBUG_MAXZONES > 1 */ int malloc_last_fail(void) { return (time_uptime - t_malloc_fail); } /* * An allocation has succeeded -- update malloc type statistics for the * amount of bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-PCU * statistics. 
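 *
 * No lock is needed: zpcpu_get() returns the current CPU's private
 * statistics slot, and the critical section prevents both preemption and
 * migration for the duration of the update.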
*/ static void malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size, int zindx) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = mtp->ks_handle; mtsp = zpcpu_get(mtip->mti_stats); if (size > 0) { mtsp->mts_memalloced += size; mtsp->mts_numallocs++; } if (zindx != -1) mtsp->mts_size |= 1 << zindx; #ifdef KDTRACE_HOOKS if (__predict_false(dtrace_malloc_enabled)) { uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_MALLOC]; if (probe_id != 0) (dtrace_malloc_probe)(probe_id, (uintptr_t) mtp, (uintptr_t) mtip, (uintptr_t) mtsp, size, zindx); } #endif critical_exit(); } void malloc_type_allocated(struct malloc_type *mtp, unsigned long size) { if (size > 0) malloc_type_zone_allocated(mtp, size, -1); } /* * A free operation has occurred -- update malloc type statistics for the * amount of the bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-CPU * statistics. */ void malloc_type_freed(struct malloc_type *mtp, unsigned long size) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = mtp->ks_handle; mtsp = zpcpu_get(mtip->mti_stats); mtsp->mts_memfreed += size; mtsp->mts_numfrees++; #ifdef KDTRACE_HOOKS if (__predict_false(dtrace_malloc_enabled)) { uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_FREE]; if (probe_id != 0) (dtrace_malloc_probe)(probe_id, (uintptr_t) mtp, (uintptr_t) mtip, (uintptr_t) mtsp, size, 0); } #endif critical_exit(); } /* * contigmalloc: * * Allocate a block of physically contiguous memory. * * If M_NOWAIT is set, this routine will not block and return NULL if * the allocation fails. */ void * contigmalloc(unsigned long size, struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) { void *ret; ret = (void *)kmem_alloc_contig(size, flags, low, high, alignment, boundary, VM_MEMATTR_DEFAULT); if (ret != NULL) malloc_type_allocated(type, round_page(size)); return (ret); } void * contigmalloc_domainset(unsigned long size, struct malloc_type *type, struct domainset *ds, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) { void *ret; ret = (void *)kmem_alloc_contig_domainset(ds, size, flags, low, high, alignment, boundary, VM_MEMATTR_DEFAULT); if (ret != NULL) malloc_type_allocated(type, round_page(size)); return (ret); } /* * contigfree: * * Free a block of memory allocated by contigmalloc. * * This routine may not block. */ void contigfree(void *addr, unsigned long size, struct malloc_type *type) { kmem_free((vm_offset_t)addr, size); malloc_type_freed(type, round_page(size)); } #ifdef MALLOC_DEBUG static int malloc_dbg(caddr_t *vap, size_t *sizep, struct malloc_type *mtp, int flags) { #ifdef INVARIANTS int indx; KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic")); /* * Check that exactly one of M_WAITOK or M_NOWAIT is specified. 
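 *
 * Passing both flags, or neither, trips this check; the first offending
 * call is logged with a backtrace and promoted to M_WAITOK.  Correct
 * usage looks like (illustrative only, "sc" is a placeholder):
 *
 *	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK | M_ZERO);
 * or
 *	sc = malloc(sizeof(*sc), M_TEMP, M_NOWAIT);
 *	if (sc == NULL)
 *		return (ENOMEM);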
*/ indx = flags & (M_WAITOK | M_NOWAIT); if (indx != M_NOWAIT && indx != M_WAITOK) { static struct timeval lasterr; static int curerr, once; if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) { printf("Bad malloc flags: %x\n", indx); kdb_backtrace(); flags |= M_WAITOK; once++; } } #endif #ifdef MALLOC_MAKE_FAILURES if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) { atomic_add_int(&malloc_nowait_count, 1); if ((malloc_nowait_count % malloc_failure_rate) == 0) { atomic_add_int(&malloc_failure_count, 1); t_malloc_fail = time_uptime; *vap = NULL; return (EJUSTRETURN); } } #endif if (flags & M_WAITOK) { KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); +#ifdef EPOCH_TRACE + if (__predict_false(curthread->td_epochnest > 0)) + epoch_trace_list(curthread); +#endif KASSERT(curthread->td_epochnest == 0, ("malloc(M_WAITOK) in epoch context")); } KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("malloc: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if (memguard_cmp_mtp(mtp, *sizep)) { *vap = memguard_alloc(*sizep, flags); if (*vap != NULL) return (EJUSTRETURN); /* This is unfortunate but should not be fatal. */ } #endif #ifdef DEBUG_REDZONE *sizep = redzone_size_ntor(*sizep); #endif return (0); } #endif /* * malloc: * * Allocate a block of memory. * * If M_NOWAIT is set, this routine will not block and return NULL if * the allocation fails. */ void * (malloc)(size_t size, struct malloc_type *mtp, int flags) { int indx; caddr_t va; uma_zone_t zone; #if defined(DEBUG_REDZONE) unsigned long osize = size; #endif #ifdef MALLOC_DEBUG va = NULL; if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif if (size <= kmem_zmax && (flags & M_EXEC) == 0) { if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)]; #ifdef MALLOC_PROFILE krequests[size >> KMEM_ZSHIFT]++; #endif va = uma_zalloc(zone, flags); if (va != NULL) size = zone->uz_size; malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); } else { size = roundup(size, PAGE_SIZE); zone = NULL; va = uma_large_malloc(size, flags); malloc_type_allocated(mtp, va == NULL ? 0 : size); } if (flags & M_WAITOK) KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); else if (va == NULL) t_malloc_fail = time_uptime; #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); #endif return ((void *) va); } static void * malloc_domain(size_t size, struct malloc_type *mtp, int domain, int flags) { int indx; caddr_t va; uma_zone_t zone; #if defined(DEBUG_REDZONE) unsigned long osize = size; #endif #ifdef MALLOC_DEBUG va = NULL; if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif if (size <= kmem_zmax && (flags & M_EXEC) == 0) { if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)]; #ifdef MALLOC_PROFILE krequests[size >> KMEM_ZSHIFT]++; #endif va = uma_zalloc_domain(zone, NULL, domain, flags); if (va != NULL) size = zone->uz_size; malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); } else { size = roundup(size, PAGE_SIZE); zone = NULL; va = uma_large_malloc_domain(size, domain, flags); malloc_type_allocated(mtp, va == NULL ? 
0 : size); } if (flags & M_WAITOK) KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); else if (va == NULL) t_malloc_fail = time_uptime; #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); #endif return ((void *) va); } void * malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds, int flags) { struct vm_domainset_iter di; void *ret; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { ret = malloc_domain(size, mtp, domain, flags); if (ret != NULL) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (ret); } void * mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) { if (WOULD_OVERFLOW(nmemb, size)) panic("mallocarray: %zu * %zu overflowed", nmemb, size); return (malloc(size * nmemb, type, flags)); } #ifdef INVARIANTS static void free_save_type(void *addr, struct malloc_type *mtp, u_long size) { struct malloc_type **mtpp = addr; /* * Cache a pointer to the malloc_type that most recently freed * this memory here. This way we know who is most likely to * have stepped on it later. * * This code assumes that size is a multiple of 8 bytes for * 64 bit machines */ mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR); mtpp += (size - sizeof(struct malloc_type *)) / sizeof(struct malloc_type *); *mtpp = mtp; } #endif #ifdef MALLOC_DEBUG static int free_dbg(void **addrp, struct malloc_type *mtp) { void *addr; addr = *addrp; KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("free: called with spinlock or critical section held")); /* free(NULL, ...) does nothing */ if (addr == NULL) return (EJUSTRETURN); #ifdef DEBUG_MEMGUARD if (is_memguard_addr(addr)) { memguard_free(addr); return (EJUSTRETURN); } #endif #ifdef DEBUG_REDZONE redzone_check(addr); *addrp = redzone_addr_ntor(addr); #endif return (0); } #endif /* * free: * * Free a block of memory allocated by malloc. * * This routine may not block. */ void free(void *addr, struct malloc_type *mtp) { uma_slab_t slab; u_long size; #ifdef MALLOC_DEBUG if (free_dbg(&addr, mtp) != 0) return; #endif /* free(NULL, ...) does nothing */ if (addr == NULL) return; slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!(slab->us_flags & UMA_SLAB_MALLOC)) { size = slab->us_keg->uk_size; #ifdef INVARIANTS free_save_type(addr, mtp, size); #endif uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { size = slab->us_size; uma_large_free(slab); } malloc_type_freed(mtp, size); } void free_domain(void *addr, struct malloc_type *mtp) { uma_slab_t slab; u_long size; #ifdef MALLOC_DEBUG if (free_dbg(&addr, mtp) != 0) return; #endif /* free(NULL, ...) 
does nothing */ if (addr == NULL) return; slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); if (slab == NULL) panic("free_domain: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!(slab->us_flags & UMA_SLAB_MALLOC)) { size = slab->us_keg->uk_size; #ifdef INVARIANTS free_save_type(addr, mtp, size); #endif uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { size = slab->us_size; uma_large_free(slab); } malloc_type_freed(mtp, size); } /* * realloc: change the size of a memory block */ void * realloc(void *addr, size_t size, struct malloc_type *mtp, int flags) { uma_slab_t slab; unsigned long alloc; void *newaddr; KASSERT(mtp->ks_magic == M_MAGIC, ("realloc: bad malloc type magic")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("realloc: called with spinlock or critical section held")); /* realloc(NULL, ...) is equivalent to malloc(...) */ if (addr == NULL) return (malloc(size, mtp, flags)); /* * XXX: Should report free of old memory and alloc of new memory to * per-CPU stats. */ #ifdef DEBUG_MEMGUARD if (is_memguard_addr(addr)) return (memguard_realloc(addr, size, mtp, flags)); #endif #ifdef DEBUG_REDZONE slab = NULL; alloc = redzone_get_size(addr); #else slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK)); /* Sanity check */ KASSERT(slab != NULL, ("realloc: address %p out of range", (void *)addr)); /* Get the size of the original block */ if (!(slab->us_flags & UMA_SLAB_MALLOC)) alloc = slab->us_keg->uk_size; else alloc = slab->us_size; /* Reuse the original block if appropriate */ if (size <= alloc && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE)) return (addr); #endif /* !DEBUG_REDZONE */ /* Allocate a new, bigger (or smaller) block */ if ((newaddr = malloc(size, mtp, flags)) == NULL) return (NULL); /* Copy over original contents */ bcopy(addr, newaddr, min(size, alloc)); free(addr, mtp); return (newaddr); } /* * reallocf: same as realloc() but free memory on failure. */ void * reallocf(void *addr, size_t size, struct malloc_type *mtp, int flags) { void *mem; if ((mem = realloc(addr, size, mtp, flags)) == NULL) free(addr, mtp); return (mem); } #ifndef __sparc64__ CTASSERT(VM_KMEM_SIZE_SCALE >= 1); #endif /* * Initialize the kernel memory (kmem) arena. */ void kmeminit(void) { u_long mem_size; u_long tmp; #ifdef VM_KMEM_SIZE if (vm_kmem_size == 0) vm_kmem_size = VM_KMEM_SIZE; #endif #ifdef VM_KMEM_SIZE_MIN if (vm_kmem_size_min == 0) vm_kmem_size_min = VM_KMEM_SIZE_MIN; #endif #ifdef VM_KMEM_SIZE_MAX if (vm_kmem_size_max == 0) vm_kmem_size_max = VM_KMEM_SIZE_MAX; #endif /* * Calculate the amount of kernel virtual address (KVA) space that is * preallocated to the kmem arena. In order to support a wide range * of machines, it is a function of the physical memory size, * specifically, * * min(max(physical memory size / VM_KMEM_SIZE_SCALE, * VM_KMEM_SIZE_MIN), VM_KMEM_SIZE_MAX) * * Every architecture must define an integral value for * VM_KMEM_SIZE_SCALE. However, the definitions of VM_KMEM_SIZE_MIN * and VM_KMEM_SIZE_MAX, which represent respectively the floor and * ceiling on this preallocation, are optional. Typically, * VM_KMEM_SIZE_MAX is itself a function of the available KVA space on * a given architecture. 
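 *
 * For example, with 4 GB of physical memory (1048576 4 KB pages) and a
 * scale factor of 3, the initial estimate is roughly 1.33 GB of KVA,
 * which is then clamped by the optional floor and ceiling above.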
*/ mem_size = vm_cnt.v_page_count; if (mem_size <= 32768) /* delphij XXX 128MB */ kmem_zmax = PAGE_SIZE; if (vm_kmem_size_scale < 1) vm_kmem_size_scale = VM_KMEM_SIZE_SCALE; /* * Check if we should use defaults for the "vm_kmem_size" * variable: */ if (vm_kmem_size == 0) { vm_kmem_size = mem_size / vm_kmem_size_scale; vm_kmem_size = vm_kmem_size * PAGE_SIZE < vm_kmem_size ? vm_kmem_size_max : vm_kmem_size * PAGE_SIZE; if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) vm_kmem_size = vm_kmem_size_min; if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max) vm_kmem_size = vm_kmem_size_max; } if (vm_kmem_size == 0) panic("Tune VM_KMEM_SIZE_* for the platform"); /* * The amount of KVA space that is preallocated to the * kmem arena can be set statically at compile-time or manually * through the kernel environment. However, it is still limited to * twice the physical memory size, which has been sufficient to handle * the most severe cases of external fragmentation in the kmem arena. */ if (vm_kmem_size / 2 / PAGE_SIZE > mem_size) vm_kmem_size = 2 * mem_size * PAGE_SIZE; vm_kmem_size = round_page(vm_kmem_size); #ifdef DEBUG_MEMGUARD tmp = memguard_fudge(vm_kmem_size, kernel_map); #else tmp = vm_kmem_size; #endif uma_set_limit(tmp); #ifdef DEBUG_MEMGUARD /* * Initialize MemGuard if support compiled in. MemGuard is a * replacement allocator used for detecting tamper-after-free * scenarios as they occur. It is only used for debugging. */ memguard_init(kernel_arena); #endif } /* * Initialize the kernel memory allocator */ /* ARGSUSED*/ static void mallocinit(void *dummy) { int i; uint8_t indx; mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF); kmeminit(); if (kmem_zmax < PAGE_SIZE || kmem_zmax > KMEM_ZMAX) kmem_zmax = KMEM_ZMAX; mt_stats_zone = uma_zcreate("mt_stats_zone", sizeof(struct malloc_type_stats), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal), #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_MALLOC); for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { int size = kmemzones[indx].kz_size; char *name = kmemzones[indx].kz_name; int subzone; for (subzone = 0; subzone < numzones; subzone++) { kmemzones[indx].kz_zone[subzone] = uma_zcreate(name, size, #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_MALLOC); } for (;i <= size; i+= KMEM_ZBASE) kmemsize[i >> KMEM_ZSHIFT] = indx; } } SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_SECOND, mallocinit, NULL); void malloc_init(void *data) { struct malloc_type_internal *mtip; struct malloc_type *mtp; KASSERT(vm_cnt.v_page_count != 0, ("malloc_register before vm_init")); mtp = data; if (mtp->ks_magic != M_MAGIC) panic("malloc_init: bad malloc type magic"); mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO); mtip->mti_stats = uma_zalloc_pcpu(mt_stats_zone, M_WAITOK | M_ZERO); mtp->ks_handle = mtip; mtp_set_subzone(mtp); mtx_lock(&malloc_mtx); mtp->ks_next = kmemstatistics; kmemstatistics = mtp; kmemcount++; mtx_unlock(&malloc_mtx); } void malloc_uninit(void *data) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; struct malloc_type *mtp, *temp; uma_slab_t slab; long temp_allocs, temp_bytes; int i; mtp = data; KASSERT(mtp->ks_magic == M_MAGIC, ("malloc_uninit: bad malloc type magic")); KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL")); mtx_lock(&malloc_mtx); mtip = 
mtp->ks_handle; mtp->ks_handle = NULL; if (mtp != kmemstatistics) { for (temp = kmemstatistics; temp != NULL; temp = temp->ks_next) { if (temp->ks_next == mtp) { temp->ks_next = mtp->ks_next; break; } } KASSERT(temp, ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc)); } else kmemstatistics = mtp->ks_next; kmemcount--; mtx_unlock(&malloc_mtx); /* * Look for memory leaks. */ temp_allocs = temp_bytes = 0; for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); temp_allocs += mtsp->mts_numallocs; temp_allocs -= mtsp->mts_numfrees; temp_bytes += mtsp->mts_memalloced; temp_bytes -= mtsp->mts_memfreed; } if (temp_allocs > 0 || temp_bytes > 0) { printf("Warning: memory type %s leaked memory on destroy " "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc, temp_allocs, temp_bytes); } slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK)); uma_zfree_pcpu(mt_stats_zone, mtip->mti_stats); uma_zfree_arg(mt_zone, mtip, slab); } struct malloc_type * malloc_desc2type(const char *desc) { struct malloc_type *mtp; mtx_assert(&malloc_mtx, MA_OWNED); for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { if (strcmp(mtp->ks_shortdesc, desc) == 0) return (mtp); } return (NULL); } static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) { struct malloc_type_stream_header mtsh; struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp, zeromts; struct malloc_type_header mth; struct malloc_type *mtp; int error, i; struct sbuf sbuf; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); mtx_lock(&malloc_mtx); bzero(&zeromts, sizeof(zeromts)); /* * Insert stream header. */ bzero(&mtsh, sizeof(mtsh)); mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION; mtsh.mtsh_maxcpus = MAXCPU; mtsh.mtsh_count = kmemcount; (void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh)); /* * Insert alternating sequence of type headers and type statistics. */ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = (struct malloc_type_internal *)mtp->ks_handle; /* * Insert type header. */ bzero(&mth, sizeof(mth)); strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME); (void)sbuf_bcat(&sbuf, &mth, sizeof(mth)); /* * Insert type statistics for each CPU. */ for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); (void)sbuf_bcat(&sbuf, mtsp, sizeof(*mtsp)); } /* * Fill in the missing CPUs. 
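 * The stream header above advertises MAXCPU records per type, so pad the
 * remaining slots with zeroed statistics; consumers such as libmemstat(3)
 * depend on the fixed per-type record count.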
*/ for (; i < MAXCPU; i++) { (void)sbuf_bcat(&sbuf, &zeromts, sizeof(zeromts)); } } mtx_unlock(&malloc_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats", "Return malloc types"); SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0, "Count of kernel malloc types"); void malloc_type_list(malloc_type_list_func_t *func, void *arg) { struct malloc_type *mtp, **bufmtp; int count, i; size_t buflen; mtx_lock(&malloc_mtx); restart: mtx_assert(&malloc_mtx, MA_OWNED); count = kmemcount; mtx_unlock(&malloc_mtx); buflen = sizeof(struct malloc_type *) * count; bufmtp = malloc(buflen, M_TEMP, M_WAITOK); mtx_lock(&malloc_mtx); if (count < kmemcount) { free(bufmtp, M_TEMP); goto restart; } for (mtp = kmemstatistics, i = 0; mtp != NULL; mtp = mtp->ks_next, i++) bufmtp[i] = mtp; mtx_unlock(&malloc_mtx); for (i = 0; i < count; i++) (func)(bufmtp[i], arg); free(bufmtp, M_TEMP); } #ifdef DDB static int64_t get_malloc_stats(const struct malloc_type_internal *mtip, uint64_t *allocs, uint64_t *inuse) { const struct malloc_type_stats *mtsp; uint64_t frees, alloced, freed; int i; *allocs = 0; frees = 0; alloced = 0; freed = 0; for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); *allocs += mtsp->mts_numallocs; frees += mtsp->mts_numfrees; alloced += mtsp->mts_memalloced; freed += mtsp->mts_memfreed; } *inuse = *allocs - frees; return (alloced - freed); } DB_SHOW_COMMAND(malloc, db_show_malloc) { const char *fmt_hdr, *fmt_entry; struct malloc_type *mtp; uint64_t allocs, inuse; int64_t size; /* variables for sorting */ struct malloc_type *last_mtype, *cur_mtype; int64_t cur_size, last_size; int ties; if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jdK,%ju\n"; } else { fmt_hdr = "%18s %12s %12s %12s\n"; fmt_entry = "%18s %12ju %12jdK %12ju\n"; } db_printf(fmt_hdr, "Type", "InUse", "MemUse", "Requests"); /* Select sort, largest size first. */ last_mtype = NULL; last_size = INT64_MAX; for (;;) { cur_mtype = NULL; cur_size = -1; ties = 0; for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { /* * In the case of size ties, print out mtypes * in the order they are encountered. That is, * when we encounter the most recently output * mtype, we have already printed all preceding * ties, and we must print all following ties. 
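 *
 * The 'ties' flag widens the acceptance window by one so that a type
 * whose size equals last_size, but which follows last_mtype in the list,
 * is still selected on this pass (size < last_size + ties admits
 * size == last_size).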
*/ if (mtp == last_mtype) { ties = 1; continue; } size = get_malloc_stats(mtp->ks_handle, &allocs, &inuse); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_mtype = mtp; } } if (cur_mtype == NULL) break; size = get_malloc_stats(cur_mtype->ks_handle, &allocs, &inuse); db_printf(fmt_entry, cur_mtype->ks_shortdesc, inuse, howmany(size, 1024), allocs); if (db_pager_quit) break; last_mtype = cur_mtype; last_size = cur_size; } } #if MALLOC_DEBUG_MAXZONES > 1 DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches) { struct malloc_type_internal *mtip; struct malloc_type *mtp; u_int subzone; if (!have_addr) { db_printf("Usage: show multizone_matches \n"); return; } mtp = (void *)addr; if (mtp->ks_magic != M_MAGIC) { db_printf("Magic %lx does not match expected %x\n", mtp->ks_magic, M_MAGIC); return; } mtip = mtp->ks_handle; subzone = mtip->mti_zone; for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = mtp->ks_handle; if (mtip->mti_zone != subzone) continue; db_printf("%s\n", mtp->ks_shortdesc); if (db_pager_quit) break; } } #endif /* MALLOC_DEBUG_MAXZONES > 1 */ #endif /* DDB */ #ifdef MALLOC_PROFILE static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; uint64_t count; uint64_t waste; uint64_t mem; int error; int rsize; int size; int i; waste = 0; mem = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n Size Requests Real Size\n"); for (i = 0; i < KMEM_ZSIZE; i++) { size = i << KMEM_ZSHIFT; rsize = kmemzones[kmemsize[i]].kz_size; count = (long long unsigned)krequests[i]; sbuf_printf(&sbuf, "%6d%28llu%11d\n", size, (unsigned long long)count, rsize); if ((rsize * count) > (size * count)) waste += (rsize * count) - (size * count); mem += (rsize * count); } sbuf_printf(&sbuf, "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n", (unsigned long long)mem, (unsigned long long)waste); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling"); #endif /* MALLOC_PROFILE */ Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 353595) +++ head/sys/kern/kern_synch.c (revision 353596) @@ -1,658 +1,662 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static uint8_t pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, ""); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, NULL); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. 
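 *
 * A typical caller re-checks its wakeup condition in a loop, for example
 * (illustrative only; the "sc" names are placeholders):
 *
 *	mtx_lock(&sc->sc_mtx);
 *	while (!sc->sc_ready)
 *		msleep(&sc->sc_ready, &sc->sc_mtx, PZERO, "scrdy", 0);
 *	mtx_unlock(&sc->sc_mtx);
 *
 * msleep(9) and tsleep(9) are convenience wrappers around _sleep().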
*/ int _sleep(void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(ident != NULL, ("_sleep: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running")); +#ifdef EPOCH_TRACE + if (__predict_false(curthread->td_epochnest > 0)) + epoch_trace_list(curthread); +#endif KASSERT(td->td_epochnest == 0, ("sleeping in an epoch section")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (SCHEDULER_STOPPED_TD(td)) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep")); if ((uint8_t *)ident >= &pause_wchan[0] && (uint8_t *)ident <= &pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). 
*/ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running")); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause_sbt() delays the calling thread by the given signed binary * time. During cold bootup, pause_sbt() uses the DELAY() function * instead of the _sleep() function to do the waiting. The "sbt" * argument must be greater than or equal to zero. A "sbt" value of * zero is equivalent to a "sbt" value of one tick. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause_sbt: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if ((cold && curthread == &thread0) || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (EWOULDBLOCK); } return (_sleep(&pause_wchan[curcpu], NULL, (flags & C_CATCH) ? PCATCH : 0, wmesg, sbt, pr, flags)); } /* * Potentially release the last reference for refcount. Check for * unlikely conditions and signal the caller as to whether it was * the final ref. 
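 *
 * Consumers do not call this directly; the refcount(9) fast path is
 * expected to handle the common case inline and reach this function only
 * in the unlikely cases noted above.  Illustrative usage ("obj" is a
 * placeholder):
 *
 *	if (refcount_release(&obj->ref))
 *		obj_destroy(obj);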
*/ bool refcount_release_last(volatile u_int *count, u_int n, u_int old) { u_int waiter; waiter = old & REFCOUNT_WAITER; old = REFCOUNT_COUNT(old); if (__predict_false(n > old || REFCOUNT_SATURATED(old))) { /* * Avoid multiple destructor invocations if underflow occurred. * This is not perfect since the memory backing the containing * object may already have been reallocated. */ _refcount_update_saturated(count); return (false); } /* * Attempt to atomically clear the waiter bit. Wakeup waiters * if we are successful. */ if (waiter != 0 && atomic_cmpset_int(count, REFCOUNT_WAITER, 0)) wakeup(__DEVOLATILE(u_int *, count)); /* * Last reference. Signal the user to call the destructor. * * Ensure that the destructor sees all updates. The fence_rel * at the start of refcount_releasen synchronizes with this fence. */ atomic_thread_fence_acq(); return (true); } /* * Wait for a refcount wakeup. This does not guarantee that the ref is still * zero on return and may be subject to transient wakeups. Callers wanting * a precise answer should use refcount_wait(). */ void refcount_sleep(volatile u_int *count, const char *wmesg, int pri) { void *wchan; u_int old; if (REFCOUNT_COUNT(*count) == 0) return; wchan = __DEVOLATILE(void *, count); sleepq_lock(wchan); old = *count; for (;;) { if (REFCOUNT_COUNT(old) == 0) { sleepq_release(wchan); return; } if (old & REFCOUNT_WAITER) break; if (atomic_fcmpset_int(count, &old, old | REFCOUNT_WAITER)) break; } sleepq_add(wchan, NULL, wmesg, 0, 0); sleepq_wait(wchan, pri); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } void wakeup_any(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * The machine independent parts of context switching. */ void mi_switch(int flags, struct thread *newtd) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. 
*/ if (kdb_active) kdb_switch(); if (SCHEDULER_STOPPED_TD(td)) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ VM_CNT_INC(v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #ifdef KDTRACE_HOOKS if (SDT_PROBES_ENABLED() && ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 && (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED))) SDT_PROBE0(sched, , , preempt); #endif sched_switch(td, newtd, flags); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. */ int setrunnable(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return (0); /* FALLTHROUGH */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((td->td_flags & TDF_INMEM) == 0) { if ((td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; return (1); } } else sched_wakeup(td); return (0); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } /* ARGSUSED */ static void synch_setup(void *dummy) { callout_init(&loadav_callout, 1); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } int should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); PICKUP_GIANT(); } /* * General purpose yield system call. 
*/ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); td->td_retval[0] = 0; return (0); } Index: head/sys/kern/subr_epoch.c =================================================================== --- head/sys/kern/subr_epoch.c (revision 353595) +++ head/sys/kern/subr_epoch.c (revision 353596) @@ -1,847 +1,858 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EPOCH_TRACE #include #include #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation"); #ifdef __amd64__ #define EPOCH_ALIGN CACHE_LINE_SIZE*2 #else #define EPOCH_ALIGN CACHE_LINE_SIZE #endif TAILQ_HEAD (epoch_tdlist, epoch_tracker); typedef struct epoch_record { ck_epoch_record_t er_record; struct epoch_context er_drain_ctx; struct epoch *er_parent; volatile struct epoch_tdlist er_tdlist; volatile uint32_t er_gen; uint32_t er_cpuid; } __aligned(EPOCH_ALIGN) *epoch_record_t; struct epoch { struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); epoch_record_t e_pcpu_record; int e_idx; int e_flags; struct sx e_drain_sx; struct mtx e_drain_mtx; volatile int e_drain_count; const char *e_name; }; /* arbitrary --- needs benchmarking */ #define MAX_ADAPTIVE_SPIN 100 #define MAX_EPOCHS 64 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context)); SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information"); SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats"); /* Stats. 
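 * The counters below are exported under kern.epoch.stats; most of them
 * are bumped from the preemptible epoch's block handler further down.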
*/ static counter_u64_t block_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW, &block_count, "# of times a thread was in an epoch when epoch_wait was called"); static counter_u64_t migrate_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW, &migrate_count, "# of times thread was migrated to another CPU in epoch_wait"); static counter_u64_t turnstile_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW, &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait"); static counter_u64_t switch_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW, &switch_count, "# of times a thread voluntarily context switched in epoch_wait"); static counter_u64_t epoch_call_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW, &epoch_call_count, "# of times a callback was deferred"); static counter_u64_t epoch_call_task_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW, &epoch_call_task_count, "# of times a callback task was run"); TAILQ_HEAD (threadlist, thread); CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container) epoch_t allepochs[MAX_EPOCHS]; DPCPU_DEFINE(struct grouptask, epoch_cb_task); DPCPU_DEFINE(int, epoch_cb_count); static __read_mostly int inited; static __read_mostly int epoch_count; __read_mostly epoch_t global_epoch; __read_mostly epoch_t global_epoch_preempt; static void epoch_call_task(void *context __unused); static uma_zone_t pcpu_zone_record; #ifdef EPOCH_TRACE struct stackentry { RB_ENTRY(stackentry) se_node; struct stack se_stack; }; static int stackentry_compare(struct stackentry *a, struct stackentry *b) { if (a->se_stack.depth > b->se_stack.depth) return (1); if (a->se_stack.depth < b->se_stack.depth) return (-1); for (int i = 0; i < a->se_stack.depth; i++) { if (a->se_stack.pcs[i] > b->se_stack.pcs[i]) return (1); if (a->se_stack.pcs[i] < b->se_stack.pcs[i]) return (-1); } return (0); } RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks); RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare); static struct mtx epoch_stacks_lock; MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF); static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2); static inline void epoch_trace_report(const char *fmt, ...) { va_list ap; struct stackentry se, *new; stack_zero(&se.se_stack); /* XXX: is it really needed? */ stack_save(&se.se_stack); /* Tree is never reduced - go lockless. 
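 * A racy lookup can at worst miss an entry that is being inserted
 * concurrently; in that case the RB_INSERT below, performed under
 * epoch_stacks_lock, detects the duplicate and the extra node is freed.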
*/ if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL) return; new = malloc(sizeof(*new), M_STACK, M_NOWAIT); if (new != NULL) { bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack)); mtx_lock(&epoch_stacks_lock); new = RB_INSERT(stacktree, &epoch_stacks, new); mtx_unlock(&epoch_stacks_lock); if (new != NULL) free(new, M_STACK); } va_start(ap, fmt); (void)vprintf(fmt, ap); va_end(ap); stack_print_ddb(&se.se_stack); } static inline void epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et, const char *file, int line) { epoch_tracker_t iet; SLIST_FOREACH(iet, &td->td_epochs, et_tlink) if (iet->et_epoch == epoch) epoch_trace_report("Recursively entering epoch %s " "previously entered at %s:%d\n", epoch->e_name, iet->et_file, iet->et_line); et->et_epoch = epoch; et->et_file = file; et->et_line = line; SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink); } static inline void epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et, const char *file, int line) { if (SLIST_FIRST(&td->td_epochs) != et) { epoch_trace_report("Exiting epoch %s in a not nested order. " "Most recently entered %s at %s:%d\n", epoch->e_name, SLIST_FIRST(&td->td_epochs)->et_epoch->e_name, SLIST_FIRST(&td->td_epochs)->et_file, SLIST_FIRST(&td->td_epochs)->et_line); /* This will panic if et is not anywhere on td_epochs. */ SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink); } else SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink); } + +/* Used by assertions that check thread state before going to sleep. */ +void +epoch_trace_list(struct thread *td) +{ + epoch_tracker_t iet; + + SLIST_FOREACH(iet, &td->td_epochs, et_tlink) + printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name, + iet->et_file, iet->et_line); +} #endif /* EPOCH_TRACE */ static void epoch_init(void *arg __unused) { int cpu; block_count = counter_u64_alloc(M_WAITOK); migrate_count = counter_u64_alloc(M_WAITOK); turnstile_count = counter_u64_alloc(M_WAITOK); switch_count = counter_u64_alloc(M_WAITOK); epoch_call_count = counter_u64_alloc(M_WAITOK); epoch_call_task_count = counter_u64_alloc(M_WAITOK); pcpu_zone_record = uma_zcreate("epoch_record pcpu", sizeof(struct epoch_record), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); CPU_FOREACH(cpu) { GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0, epoch_call_task, NULL); taskqgroup_attach_cpu(qgroup_softirq, DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL, "epoch call task"); } #ifdef EPOCH_TRACE SLIST_INIT(&thread0.td_epochs); #endif inited = 1; global_epoch = epoch_alloc("Global", 0); global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT); } SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL); #if !defined(EARLY_AP_STARTUP) static void epoch_init_smp(void *dummy __unused) { inited = 2; } SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL); #endif static void epoch_ctor(epoch_t epoch) { epoch_record_t er; int cpu; epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK); CPU_FOREACH(cpu) { er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); bzero(er, sizeof(*er)); ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = cpu; er->er_parent = epoch; } } static void epoch_adjust_prio(struct thread *td, u_char prio) { thread_lock(td); sched_prio(td, prio); thread_unlock(td); } epoch_t epoch_alloc(const char *name, int flags) { epoch_t epoch; if (__predict_false(!inited)) panic("%s called too early in boot", __func__); epoch = 
malloc(sizeof(struct epoch), M_EPOCH, M_ZERO | M_WAITOK); ck_epoch_init(&epoch->e_epoch); epoch_ctor(epoch); MPASS(epoch_count < MAX_EPOCHS - 2); epoch->e_flags = flags; epoch->e_idx = epoch_count; epoch->e_name = name; sx_init(&epoch->e_drain_sx, "epoch-drain-sx"); mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF); allepochs[epoch_count++] = epoch; return (epoch); } void epoch_free(epoch_t epoch) { epoch_drain_callbacks(epoch); allepochs[epoch->e_idx] = NULL; epoch_wait(global_epoch); uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record); mtx_destroy(&epoch->e_drain_mtx); sx_destroy(&epoch->e_drain_sx); free(epoch, M_EPOCH); } static epoch_record_t epoch_currecord(epoch_t epoch) { return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu)); } #define INIT_CHECK(epoch) \ do { \ if (__predict_false((epoch) == NULL)) \ return; \ } while (0) void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); MPASS(epoch->e_flags & EPOCH_PREEMPT); td = curthread; #ifdef EPOCH_TRACE epoch_trace_enter(td, epoch, et, file, line); #endif et->et_td = td; td->td_epochnest++; critical_enter(); sched_pin(); td->td_pre_epoch_prio = td->td_priority; er = epoch_currecord(epoch); TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link); ck_epoch_begin(&er->er_record, &et->et_section); critical_exit(); } void epoch_enter(epoch_t epoch) { struct thread *td; epoch_record_t er; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); td = curthread; td->td_epochnest++; critical_enter(); er = epoch_currecord(epoch); ck_epoch_begin(&er->er_record, NULL); } void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; INIT_CHECK(epoch); td = curthread; critical_enter(); sched_unpin(); MPASS(td->td_epochnest); td->td_epochnest--; er = epoch_currecord(epoch); MPASS(epoch->e_flags & EPOCH_PREEMPT); MPASS(et != NULL); MPASS(et->et_td == td); #ifdef INVARIANTS et->et_td = (void*)0xDEADBEEF; #endif ck_epoch_end(&er->er_record, &et->et_section); TAILQ_REMOVE(&er->er_tdlist, et, et_link); er->er_gen++; if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) epoch_adjust_prio(td, td->td_pre_epoch_prio); critical_exit(); #ifdef EPOCH_TRACE epoch_trace_exit(td, epoch, et, file, line); #endif } void epoch_exit(epoch_t epoch) { struct thread *td; epoch_record_t er; INIT_CHECK(epoch); td = curthread; MPASS(td->td_epochnest); td->td_epochnest--; er = epoch_currecord(epoch); ck_epoch_end(&er->er_record, NULL); critical_exit(); } /* * epoch_block_handler_preempt() is a callback from the CK code when another * thread is currently in an epoch section. */ static void epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t *cr, void *arg __unused) { epoch_record_t record; struct thread *td, *owner, *curwaittd; struct epoch_tracker *tdwait; struct turnstile *ts; struct lock_object *lock; int spincount, gen; int locksheld __unused; record = __containerof(cr, struct epoch_record, er_record); td = curthread; locksheld = td->td_locks; spincount = 0; counter_u64_add(block_count, 1); /* * We lost a race and there's no longer any threads * on the CPU in an epoch section. 
*/ if (TAILQ_EMPTY(&record->er_tdlist)) return; if (record->er_cpuid != curcpu) { /* * If the head of the list is running, we can wait for it * to remove itself from the list and thus save us the * overhead of a migration */ gen = record->er_gen; thread_unlock(td); /* * We can't actually check if the waiting thread is running * so we simply poll for it to exit before giving up and * migrating. */ do { cpu_spinwait(); } while (!TAILQ_EMPTY(&record->er_tdlist) && gen == record->er_gen && spincount++ < MAX_ADAPTIVE_SPIN); thread_lock(td); /* * If the generation has changed we can poll again * otherwise we need to migrate. */ if (gen != record->er_gen) return; /* * Being on the same CPU as that of the record on which * we need to wait allows us access to the thread * list associated with that CPU. We can then examine the * oldest thread in the queue and wait on its turnstile * until it resumes and so on until a grace period * elapses. * */ counter_u64_add(migrate_count, 1); sched_bind(td, record->er_cpuid); /* * At this point we need to return to the ck code * to scan to see if a grace period has elapsed. * We can't move on to check the thread list, because * in the meantime new threads may have arrived that * in fact belong to a different epoch. */ return; } /* * Try to find a thread in an epoch section on this CPU * waiting on a turnstile. Otherwise find the lowest * priority thread (highest prio value) and drop our priority * to match to allow it to run. */ TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) { /* * Propagate our priority to any other waiters to prevent us * from starving them. They will have their original priority * restore on exit from epoch_wait(). */ curwaittd = tdwait->et_td; if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) { critical_enter(); thread_unlock(td); thread_lock(curwaittd); sched_prio(curwaittd, td->td_priority); thread_unlock(curwaittd); thread_lock(td); critical_exit(); } if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) && ((ts = curwaittd->td_blocked) != NULL)) { /* * We unlock td to allow turnstile_wait to reacquire * the thread lock. Before unlocking it we enter a * critical section to prevent preemption after we * reenable interrupts by dropping the thread lock in * order to prevent curwaittd from getting to run. */ critical_enter(); thread_unlock(td); if (turnstile_lock(ts, &lock, &owner)) { if (ts == curwaittd->td_blocked) { MPASS(TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd)); critical_exit(); turnstile_wait(ts, owner, curwaittd->td_tsqueue); counter_u64_add(turnstile_count, 1); thread_lock(td); return; } turnstile_unlock(ts, lock); } thread_lock(td); critical_exit(); KASSERT(td->td_locks == locksheld, ("%d extra locks held", td->td_locks - locksheld)); } } /* * We didn't find any threads actually blocked on a lock * so we have nothing to do except context switch away. */ counter_u64_add(switch_count, 1); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); /* * Release the thread lock while yielding to * allow other threads to acquire the lock * pointed to by TDQ_LOCKPTR(td). Else a * deadlock like situation might happen. 
(HPS) */ thread_unlock(td); thread_lock(td); } void epoch_wait_preempt(epoch_t epoch) { struct thread *td; int was_bound; int old_cpu; int old_pinned; u_char old_prio; int locks __unused; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); td = curthread; #ifdef INVARIANTS locks = curthread->td_locks; MPASS(epoch->e_flags & EPOCH_PREEMPT); if ((epoch->e_flags & EPOCH_LOCKED) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "epoch_wait() can be long running"); KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle " "of an epoch section of the same epoch")); #endif thread_lock(td); DROP_GIANT(); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; old_prio = td->td_priority; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; sched_bind(td, old_cpu); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, NULL); /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; /* restore thread priority */ sched_prio(td, old_prio); thread_unlock(td); PICKUP_GIANT(); KASSERT(td->td_locks == locks, ("%d residual locks held", td->td_locks - locks)); } static void epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused, void *arg __unused) { cpu_spinwait(); } void epoch_wait(epoch_t epoch) { MPASS(cold || epoch != NULL); INIT_CHECK(epoch); MPASS(epoch->e_flags == 0); critical_enter(); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL); critical_exit(); } void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)) { epoch_record_t er; ck_epoch_entry_t *cb; cb = (void *)ctx; MPASS(callback); /* too early in boot to have epoch set up */ if (__predict_false(epoch == NULL)) goto boottime; #if !defined(EARLY_AP_STARTUP) if (__predict_false(inited < 2)) goto boottime; #endif critical_enter(); *DPCPU_PTR(epoch_cb_count) += 1; er = epoch_currecord(epoch); ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback); critical_exit(); return; boottime: callback(ctx); } static void epoch_call_task(void *arg __unused) { ck_stack_entry_t *cursor, *head, *next; ck_epoch_record_t *record; epoch_record_t er; epoch_t epoch; ck_stack_t cb_stack; int i, npending, total; ck_stack_init(&cb_stack); critical_enter(); epoch_enter(global_epoch); for (total = i = 0; i < epoch_count; i++) { if (__predict_false((epoch = allepochs[i]) == NULL)) continue; er = epoch_currecord(epoch); record = &er->er_record; if ((npending = record->n_pending) == 0) continue; ck_epoch_poll_deferred(record, &cb_stack); total += npending - record->n_pending; } epoch_exit(global_epoch); *DPCPU_PTR(epoch_cb_count) -= total; critical_exit(); counter_u64_add(epoch_call_count, total); counter_u64_add(epoch_call_task_count, 1); head = ck_stack_batch_pop_npsc(&cb_stack); for (cursor = head; cursor != NULL; cursor = next) { struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor); next = CK_STACK_NEXT(cursor); entry->function(entry); } } int in_epoch_verbose(epoch_t epoch, int dump_onfail) { struct epoch_tracker *tdwait; struct thread *td; epoch_record_t er; td = curthread; if (td->td_epochnest == 0) return (0); if (__predict_false((epoch) == NULL)) return (0); critical_enter(); er = epoch_currecord(epoch); TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) if (tdwait->et_td == td) { critical_exit(); return (1); } #ifdef INVARIANTS if 
(dump_onfail) { MPASS(td->td_pinned); printf("cpu: %d id: %d\n", curcpu, td->td_tid); TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) printf("td_tid: %d ", tdwait->et_td->td_tid); printf("\n"); } #endif critical_exit(); return (0); } int in_epoch(epoch_t epoch) { return (in_epoch_verbose(epoch, 0)); } static void epoch_drain_cb(struct epoch_context *ctx) { struct epoch *epoch = __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent; if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) { mtx_lock(&epoch->e_drain_mtx); wakeup(epoch); mtx_unlock(&epoch->e_drain_mtx); } } void epoch_drain_callbacks(epoch_t epoch) { epoch_record_t er; struct thread *td; int was_bound; int old_pinned; int old_cpu; int cpu; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "epoch_drain_callbacks() may sleep!"); /* too early in boot to have epoch set up */ if (__predict_false(epoch == NULL)) return; #if !defined(EARLY_AP_STARTUP) if (__predict_false(inited < 2)) return; #endif DROP_GIANT(); sx_xlock(&epoch->e_drain_sx); mtx_lock(&epoch->e_drain_mtx); td = curthread; thread_lock(td); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; CPU_FOREACH(cpu) epoch->e_drain_count++; CPU_FOREACH(cpu) { er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); sched_bind(td, cpu); epoch_call(epoch, &er->er_drain_ctx, &epoch_drain_cb); } /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; thread_unlock(td); while (epoch->e_drain_count != 0) msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0); mtx_unlock(&epoch->e_drain_mtx); sx_xunlock(&epoch->e_drain_sx); PICKUP_GIANT(); } void epoch_thread_init(struct thread *td) { td->td_et = malloc(sizeof(struct epoch_tracker), M_EPOCH, M_WAITOK); } void epoch_thread_fini(struct thread *td) { free(td->td_et, M_EPOCH); } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 353595) +++ head/sys/kern/subr_trap.c (revision 353596) @@ -1,356 +1,360 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007 The FreeBSD Foundation * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #ifdef VIMAGE #include #endif #ifdef HWPMC_HOOKS #include #endif #include void (*softdep_ast_cleanup)(struct thread *); /* * Define the code needed before returning to user mode, for trap and * syscall. */ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, td->td_name); KASSERT((p->p_flag & P_WEXIT) == 0, ("Exiting process returns to usermode")); #ifdef DIAGNOSTIC /* * Check that we called signotify() enough. For * multi-threaded processes, where signal distribution might * change due to other threads changing sigmask, the check is * racy and cannot be performed reliably. * If current process is vfork child, indicated by P_PPWAIT, then * issignal() ignores stops, so we block the check to avoid * classifying pending signals. */ if (p->p_numthreads == 1) { PROC_LOCK(p); thread_lock(td); if ((p->p_flag & P_PPWAIT) == 0) { KASSERT(!SIGPENDING(td) || (td->td_flags & (TDF_NEEDSIGCHK | TDF_ASTPENDING)) == (TDF_NEEDSIGCHK | TDF_ASTPENDING), ("failed to set signal flags for ast p %p " "td %p fl %x", p, td, td->td_flags)); } thread_unlock(td); PROC_UNLOCK(p); } #endif #ifdef KTRACE KTRUSERRET(td); #endif td_softdep_cleanup(td); MPASS(td->td_su == NULL); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); #ifdef HWPMC_HOOKS if (PMC_THREAD_HAS_SAMPLES(td)) PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL); #endif /* * Let the scheduler adjust our priority etc. */ sched_userret(td); /* * Check for misbehavior. * * In case there is a callchain tracing ongoing because of * hwpmc(4), skip the scheduler pinning check. * hwpmc(4) subsystem, infact, will collect callchain informations * at ast() checkpoint, which is past userret(). 
*/ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning"); KASSERT(td->td_critnest == 0, ("userret: Returning in a critical section")); +#ifdef EPOCH_TRACE + if (__predict_false(curthread->td_epochnest > 0)) + epoch_trace_list(curthread); +#endif KASSERT(td->td_epochnest == 0, ("userret: Returning in an epoch section")); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held", td->td_locks)); KASSERT(td->td_rw_rlocks == 0, ("userret: Returning with %d rwlocks held in read mode", td->td_rw_rlocks)); KASSERT(td->td_sx_slocks == 0, ("userret: Returning with %d sx locks held in shared mode", td->td_sx_slocks)); KASSERT(td->td_lk_slocks == 0, ("userret: Returning with %d lockmanager locks held in shared mode", td->td_lk_slocks)); KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, ("userret: Returning with pagefaults disabled")); KASSERT(td->td_no_sleeping == 0, ("userret: Returning with sleep disabled")); KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0, ("userret: Returning with with pinned thread")); KASSERT(td->td_vp_reserv == 0, ("userret: Returning while holding vnode reservation")); KASSERT((td->td_flags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("userret: Returning with stop signals deferred")); KASSERT(td->td_su == NULL, ("userret: Returning with SU cleanup request not handled")); KASSERT(td->td_vslock_sz == 0, ("userret: Returning with vslock-wired space")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s", __func__, td, p->p_pid, td->td_name, curvnet, (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A")); #endif #ifdef RACCT if (__predict_false(racct_enable && p->p_throttled != 0)) racct_proc_throttled(p); #endif } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. */ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; int flags; int sig; td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; /* * This updates the td_flag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in flags, the astpending flag will be set and * ast() will be called again. */ thread_lock(td); flags = td->td_flags; td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK | TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND); thread_unlock(td); VM_CNT_INC(v_trap); if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } #ifdef HWPMC_HOOKS /* Handle Software PMC callchain capture. 
*/ if (PMC_IS_PENDING_CALLCHAIN(td)) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep); #endif if (flags & TDF_ALRMPEND) { PROC_LOCK(p); kern_psignal(p, SIGVTALRM); PROC_UNLOCK(p); } if (flags & TDF_PROFPEND) { PROC_LOCK(p); kern_psignal(p, SIGPROF); PROC_UNLOCK(p); } #ifdef MAC if (flags & TDF_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1, __func__); #endif thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL); thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1, __func__); #endif } #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (flags & TDF_NEEDSIGCHK) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDF_NEEDSIGCHK should be re-read from * td_flags, since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. */ if ((p->p_flag & P_PPWAIT) == 0) { KASSERT(!SIGPENDING(td) || (td->td_flags & (TDF_NEEDSIGCHK | TDF_ASTPENDING)) == (TDF_NEEDSIGCHK | TDF_ASTPENDING), ("failed2 to set signal flags for ast p %p td %p " "fl %x %x", p, td, flags, td->td_flags)); } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. */ if (flags & TDF_NEEDSUSPCHK) { PROC_LOCK(p); thread_suspend_check(0); PROC_UNLOCK(p); } if (td->td_pflags & TDP_OLDMASK) { td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } userret(td, framep); } const char * syscallname(struct proc *p, u_int code) { static const char unknown[] = "unknown"; struct sysentvec *sv; sv = p->p_sysent; if (sv->sv_syscallnames == NULL || code >= sv->sv_size) return (unknown); return (sv->sv_syscallnames[code]); } Index: head/sys/sys/epoch.h =================================================================== --- head/sys/sys/epoch.h (revision 353595) +++ head/sys/sys/epoch.h (revision 353596) @@ -1,99 +1,100 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EPOCH_H_ #define _SYS_EPOCH_H_ struct epoch_context { void *data[2]; } __aligned(sizeof(void *)); typedef struct epoch_context *epoch_context_t; #ifdef _KERNEL #include #include #include struct epoch; typedef struct epoch *epoch_t; #define EPOCH_PREEMPT 0x1 #define EPOCH_LOCKED 0x2 extern epoch_t global_epoch; extern epoch_t global_epoch_preempt; struct epoch_tracker { TAILQ_ENTRY(epoch_tracker) et_link; struct thread *et_td; ck_epoch_section_t et_section; #ifdef EPOCH_TRACE struct epoch *et_epoch; SLIST_ENTRY(epoch_tracker) et_tlink; const char *et_file; int et_line; #endif } __aligned(sizeof(void *)); typedef struct epoch_tracker *epoch_tracker_t; epoch_t epoch_alloc(const char *name, int flags); void epoch_free(epoch_t epoch); void epoch_wait(epoch_t epoch); void epoch_wait_preempt(epoch_t epoch); void epoch_drain_callbacks(epoch_t epoch); void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)); int in_epoch(epoch_t epoch); int in_epoch_verbose(epoch_t epoch, int dump_onfail); DPCPU_DECLARE(int, epoch_cb_count); DPCPU_DECLARE(struct grouptask, epoch_cb_task); #ifdef EPOCH_TRACE #define EPOCH_FILE_LINE , const char *file, int line #else #define EPOCH_FILE_LINE #endif void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); #ifdef EPOCH_TRACE +void epoch_trace_list(struct thread *); #define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et, __FILE__, __LINE__) #define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et, __FILE__, __LINE__) #else #define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et) #define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et) #endif void epoch_enter(epoch_t epoch); void epoch_exit(epoch_t epoch); void epoch_thread_init(struct thread *); void epoch_thread_fini(struct thread *); #endif /* _KERNEL */ #endif /* _SYS_EPOCH_H_ */
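
The epoch(9) interfaces declared in sys/epoch.h above are easiest to follow with a consumer in hand, so a few usage sketches follow; none of this code is part of the commit. First, the non-preemptible entry points (epoch_enter(), epoch_exit(), epoch_wait()). Everything named here other than the epoch(9) calls themselves -- struct counter_node, M_CTRNODE, counter_writer_lock, counter_module_init() -- is invented for the example. Readers must not sleep inside the section, since epoch_enter() runs under a critical section; the writer serializes against other writers with an ordinary mutex and uses epoch_wait() only to let in-flight readers drain before freeing.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/epoch.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/queue.h>

MALLOC_DEFINE(M_CTRNODE, "ctrnode", "example epoch-protected nodes");

struct counter_node {
	SLIST_ENTRY(counter_node) cn_link;
	uint64_t	cn_value;
};

static SLIST_HEAD(, counter_node) counter_list =
    SLIST_HEAD_INITIALIZER(counter_list);
static epoch_t counters_epoch;
static struct mtx counter_writer_lock;
MTX_SYSINIT(ctrwriter, &counter_writer_lock, "counter writer", MTX_DEF);

/* Called from the module's load path, after the epoch SYSINIT has run. */
static void
counter_module_init(void)
{

	counters_epoch = epoch_alloc("counters", 0);
}

static uint64_t
counter_sum(void)
{
	struct counter_node *cn;
	uint64_t sum;

	sum = 0;
	epoch_enter(counters_epoch);	/* read side: no sleeping allowed */
	SLIST_FOREACH(cn, &counter_list, cn_link)
		sum += cn->cn_value;
	epoch_exit(counters_epoch);
	return (sum);
}

static void
counter_remove(struct counter_node *cn)
{

	mtx_lock(&counter_writer_lock);
	SLIST_REMOVE(&counter_list, cn, counter_node, cn_link);
	mtx_unlock(&counter_writer_lock);
	/* Let readers that may still see the unlinked node drain. */
	epoch_wait(counters_epoch);
	free(cn, M_CTRNODE);
}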
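
The preemptible variant requires an epoch created with EPOCH_PREEMPT and a caller-supplied struct epoch_tracker, normally on the reader's stack, passed to both epoch_enter_preempt() and epoch_exit_preempt(); readers may be preempted but still must not sleep, while the writer may sleep in epoch_wait_preempt(). Another invented consumer follows (net_data_epoch, struct net_data, the lookup/retire pair; M_TEMP is used purely to keep the sketch short). in_epoch() is the check backed by in_epoch_verbose() above.

static epoch_t net_data_epoch;	/* epoch_alloc("net_data", EPOCH_PREEMPT) */

struct net_data {
	uint32_t	nd_key;
	/* ... payload ... */
};

static void
net_data_lookup(uint32_t key)
{
	struct epoch_tracker et;

	epoch_enter_preempt(net_data_epoch, &et);
	KASSERT(in_epoch(net_data_epoch),
	    ("net_data_lookup() outside its epoch section"));
	/* ... walk an epoch-protected structure keyed by 'key' ... */
	epoch_exit_preempt(net_data_epoch, &et);
}

static void
net_data_retire(struct net_data *nd)
{

	/* 'nd' has already been unlinked under the writer lock (not shown). */
	epoch_wait_preempt(net_data_epoch);	/* may sleep */
	free(nd, M_TEMP);
}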
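
When the updater cannot block at all, epoch_call() defers destruction to the per-CPU callback task that epoch_call_task() above drains; the callback recovers its object from the embedded struct epoch_context with __containerof(), the same idiom epoch_drain_cb() uses. The struct route_entry layout and the unload path are again hypothetical. A private epoch released with epoch_free() already drains its pending callbacks internally, so an explicit epoch_drain_callbacks() is mainly useful when the callbacks live in code (for example a module) that is about to go away while the epoch itself stays.

struct route_entry {
	uint32_t		re_dst;
	/* Must remain valid until the deferred callback has run. */
	struct epoch_context	re_epoch_ctx;
};

static void
route_entry_free_cb(epoch_context_t ctx)
{
	struct route_entry *re;

	re = __containerof(ctx, struct route_entry, re_epoch_ctx);
	free(re, M_TEMP);
}

static void
route_entry_retire(struct route_entry *re)
{

	/* Unlink 're' first so new readers cannot find it, then defer the free. */
	epoch_call(net_data_epoch, &re->re_epoch_ctx, route_entry_free_cb);
}

static void
route_module_unload(void)
{

	/* No deferred route_entry_free_cb() may still be pending past this point. */
	epoch_drain_callbacks(net_data_epoch);
}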
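
The functional change in this revision -- epoch_trace_list() plus the userret() hook -- exists to diagnose one specific bug class: a thread that returns to user mode while still inside an epoch section. With EPOCH_TRACE compiled in, a leak like the hypothetical handler below (example_ioctl() and example_lookup() are invented) makes userret() print one line per open section in the form "Epoch <name> entered at <file>:<line>", using the entry point recorded by epoch_trace_enter(), immediately before the pre-existing "userret: Returning in an epoch section" assertion fires.

static int	example_lookup(void *data);	/* invented helper that can fail */

static int
example_ioctl(void *data)
{
	struct epoch_tracker et;
	int error;

	epoch_enter_preempt(net_data_epoch, &et);
	error = example_lookup(data);
	if (error != 0)
		return (error);		/* BUG: skips epoch_exit_preempt() */
	epoch_exit_preempt(net_data_epoch, &et);
	return (0);
}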
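
epoch_trace_enter() and epoch_trace_exit() also flag recursive entry into an epoch the thread is already in, and exits that violate the LIFO order of the per-thread td_epochs list. A short illustration of the latter, with epoch_a and epoch_b invented: the out-of-order exit still completes, but it is reported as "Exiting epoch ... in a not nested order" together with a stack trace the first time that call path is seen (epoch_trace_report() deduplicates by stack).

static epoch_t epoch_a, epoch_b;	/* both created with EPOCH_PREEMPT */

static void
example_out_of_order(void)
{
	struct epoch_tracker et_a, et_b;

	epoch_enter_preempt(epoch_a, &et_a);
	epoch_enter_preempt(epoch_b, &et_b);
	/* Reported by epoch_trace_exit(): et_b is still at the head of td_epochs. */
	epoch_exit_preempt(epoch_a, &et_a);
	epoch_exit_preempt(epoch_b, &et_b);
}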